diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 0000000..585a78c --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,47 @@ +{ + "files": [ + "README.md" + ], + "imageSize": 100, + "commit": false, + "commitType": "docs", + "commitConvention": "angular", + "contributors": [ + { + "login": "danielawitten", + "name": "danielawitten", + "avatar_url": "https://avatars.githubusercontent.com/u/12654191?v=4", + "profile": "https://github.com/danielawitten", + "contributions": [ + "code", + "content" + ] + }, + { + "login": "trevorhastie", + "name": "trevorhastie", + "avatar_url": "https://avatars.githubusercontent.com/u/13293253?v=4", + "profile": "https://web.stanford.edu/~hastie/", + "contributions": [ + "code", + "content" + ] + }, + { + "login": "tibshirani", + "name": "tibshirani", + "avatar_url": "https://avatars.githubusercontent.com/u/2848609?v=4", + "profile": "https://github.com/tibshirani", + "contributions": [ + "code", + "content" + ] + } + ], + "contributorsPerLine": 7, + "skipCi": true, + "repoType": "github", + "repoHost": "https://github.com", + "projectName": "ISLP", + "projectOwner": "intro-stat-learning" +} diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml new file mode 100644 index 0000000..9260015 --- /dev/null +++ b/.github/workflows/build_docs.yml @@ -0,0 +1,85 @@ +# This builds and deploys ISLP docs + +name: Build docs + +# Controls when the workflow will run +on: + workflow_dispatch: + inputs: null + +# A workflow run is made up of one or more jobs that can run +# sequentially or in parallel + +jobs: # This workflow contains a single + # job called "build" + + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + 
python-version: '3.12' + cache: 'pip' + # Install + - name: Install dependencies + run: | + sudo apt-get install r-base + pip install -r docs/requirements.txt + pip install . + + # Checkout labs + - name: Checkout version of labs + run: | + git submodule update --init --force docs/ISLP_labs + cd docs + mkdir -p source/labs + cp ISLP_labs/Ch*nb source/labs + python fix_and_clear_notebooks.py source/labs/Ch*nb --rm_md + python make_notebooks.py --inplace --requirements=ISLP_labs/requirements.txt source/labs/Ch06-varselect-lab.ipynb + rm source/labs/Ch*md + + - name: Make docs + run: | + cd docs + make html + + # Store the output + - name: Upload docs + uses: actions/upload-artifact@v4 + with: + name: ISLP_docs + path: docs/build/html + retention-days: 5 + + deploy: + runs-on: ubuntu-latest + needs: build + + # Grant GITHUB_TOKEN the permissions required to make a Pages deployment + permissions: + pages: write # to deploy to Pages + id-token: write # to verify the deployment originates from an appropriate source + + environment: + name: github-pages + url: ${{steps.deployment.outputs.page_url}} + + steps: + - uses: actions/download-artifact@master + with: + name: ISLP_docs + path: . + - uses: actions/configure-pages@v4 + with: + node-version: 20.x + - uses: actions/upload-pages-artifact@v3 + with: + node-version: 20.x + path: . 
+ - id: deployment + uses: actions/deploy-pages@main \ No newline at end of file diff --git a/.github/workflows/build_notebook.yml b/.github/workflows/build_notebook.yml new file mode 100644 index 0000000..dbf97e8 --- /dev/null +++ b/.github/workflows/build_notebook.yml @@ -0,0 +1,105 @@ +# This is a basic workflow to help you get started with Actions + +name: Build a notebook + +# Controls when the workflow will run +on: + workflow_dispatch: + inputs: + LABS: + description: 'Labs version' + required: true + default: 'v2' + type: string + ID: + description: 'Which lab to build' + required: true + default: '03' + type: string + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build-linux: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + cache: 'pip' + + # Install + - name: Install dependencies + run: | + pip install . + pip install jupyterlab + + # Runs a set of commands using the runners shell + - name: Build notebook + env: + LABS: ${{ inputs.LABS }} + ID: ${{ inputs.ID }} + run: | + git clone https://github.com/intro-stat-learning/ISLP_labs.git + cd ISLP_labs + git checkout $LABS + cp Ch*$ID*lab.ipynb .. 
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb + jupyter nbconvert --to html ../Ch*$ID*lab.ipynb + + # Store the output + - name: Upload labs + env: + ID: ${{ inputs.ID }} + uses: actions/upload-artifact@v3 + with: + name: ISLP_labs + path: Ch* + retention-days: 1 + + build-mac: + # The type of runner that the job will run on + runs-on: macos-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + cache: 'pip' + + # Install + - name: Install dependencies + run: | + pip install . + pip install jupyterlab + + # Runs a set of commands using the runners shell + - name: Build notebook + env: + LABS: ${{ inputs.LABS }} + ID: ${{ inputs.ID }} + run: | + git clone https://github.com/intro-stat-learning/ISLP_labs.git + cd ISLP_labs + git checkout $LABS + cp Ch*$ID*lab.ipynb .. 
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb + jupyter nbconvert --to html ../Ch*$ID*lab.ipynb + + # Store the output + - name: Upload labs + env: + ID: ${{ inputs.ID }} + uses: actions/upload-artifact@v3 + with: + name: ISLP_labs + path: Ch* + retention-days: 1 \ No newline at end of file diff --git a/.github/workflows/build_notebook_errors.yml b/.github/workflows/build_notebook_errors.yml new file mode 100644 index 0000000..d5fabee --- /dev/null +++ b/.github/workflows/build_notebook_errors.yml @@ -0,0 +1,104 @@ +# This is a basic workflow to help you get started with Actions + +name: Build a notebook (allow errors, capture result) + +# Controls when the workflow will run +on: + workflow_dispatch: + inputs: + LABS: + description: 'Labs version' + required: true + default: 'v2' + type: string + ID: + description: 'Which lab to build' + required: true + default: '02' + type: string + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + + build-linux: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + cache: 'pip' + + # Install + - name: Install dependencies + run: | + pip install . + + # Runs a set of commands using the runners shell + - name: Build notebook, allowing errors + env: + LABS: ${{ inputs.LABS }} + ID: ${{ inputs.ID }} + run: | + git clone https://github.com/intro-stat-learning/ISLP_labs.git + cd ISLP_labs + git checkout $LABS + cp Ch*$ID*lab.ipynb .. 
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb --allow-errors + jupyter nbconvert --to html ../Ch*$ID*lab.ipynb + + # Store the output + - name: Upload labs + env: + ID: ${{ inputs.ID }} + uses: actions/upload-artifact@v3 + with: + name: ISLP_labs + path: Ch* + retention-days: 1 + + build-mac: + # The type of runner that the job will run on + runs-on: macos-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + cache: 'pip' + + # Install + - name: Install dependencies + run: | + pip install . + + # Runs a set of commands using the runners shell + - name: Build notebook, allowing errors + env: + LABS: ${{ inputs.LABS }} + ID: ${{ inputs.ID }} + run: | + git clone https://github.com/intro-stat-learning/ISLP_labs.git + cd ISLP_labs + git checkout $LABS + cp Ch*$ID*lab.ipynb .. 
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb --allow-errors + jupyter nbconvert --to html ../Ch*$ID*lab.ipynb + + # Store the output + - name: Upload labs + env: + ID: ${{ inputs.ID }} + uses: actions/upload-artifact@v3 + with: + name: ISLP_labs + path: Ch* + retention-days: 1 + diff --git a/.github/workflows/build_save_labs.yml b/.github/workflows/build_save_labs.yml new file mode 100644 index 0000000..57ebf78 --- /dev/null +++ b/.github/workflows/build_save_labs.yml @@ -0,0 +1,104 @@ +# This is a basic workflow to help you get started with Actions + +name: Build + save notebooks (not 10,13) + +# Controls when the workflow will run +on: + workflow_dispatch: + inputs: + LABS: + description: 'Labs version' + required: true + default: 'v2' + type: string + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + cache: 'pip' + + # Install + - name: Install dependencies + run: | + pip install . 
+ + # Runs a set of commands using the runners shell + - name: Build Ch02 notebook (allow errors) + env: + LABS: ${{ inputs.LABS }} + run: | + git clone https://github.com/intro-stat-learning/ISLP_labs.git + cd ISLP_labs + git checkout $LABS + rm Ch10* + rm Ch13* + jupyter nbconvert --execute --inplace --allow-errors Ch02*lab.ipynb + + - name: Build Ch03 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch03*lab.ipynb + + - name: Build Ch04 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch04*lab.ipynb + + - name: Build Ch05 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch05*lab.ipynb + + - name: Build Ch06 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch06*lab.ipynb + + - name: Build Ch07 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch07*lab.ipynb + + - name: Build Ch08 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch08*lab.ipynb + + - name: Build Ch09 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch09*lab.ipynb + + - name: Build Ch11 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch11*lab.ipynb + + - name: Build Ch12 notebook + run: | + cd ISLP_labs + jupyter nbconvert --execute --inplace Ch12*lab.ipynb + + - name: Build HTML + run: | + cd ISLP_labs + jupyter nbconvert --to html Ch*ipynb + + # Store the output + - name: Upload labs + uses: actions/upload-artifact@v3 + with: + name: ISLP_labs + path: Ch* + retention-days: 1 \ No newline at end of file diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml new file mode 100644 index 0000000..767a62e --- /dev/null +++ b/.github/workflows/build_test.yml @@ -0,0 +1,102 @@ +name: Build and test + +on: [push] + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.12 + uses: actions/setup-python@v4 + 
with: + python-version: '3.12' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + pip install . + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + timeout-minutes: 12 + run: | + pip install torchvision torchinfo + pip install pytest + pytest + + build-windows: + runs-on: windows-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.12 + uses: actions/setup-python@v4 + with: + python-version: '3.12' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + pip install . + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + timeout-minutes: 12 + run: | + pip install torchvision torchinfo + pip install pytest + pytest + + build-mac: + runs-on: macos-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.12 + uses: actions/setup-python@v4 + with: + python-version: '3.12' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + pip install . + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + timeout-minutes: 12 + run: | + pip install torchvision torchinfo + pip install pytest + pytest --ignore tests/deeplearning/test_hitters.py --ignore tests/deeplearning/test_mnist.py + + diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..891a60a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "docs/ISLP_labs"] + path = docs/ISLP_labs + url = https://github.com/intro-stat-learning/ISLP_labs diff --git a/.readthedocs.yaml b/.readthedocs.yaml index aacca4b..44bfa25 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,15 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.9" + python: "3.11" + apt_packages: + - r-base + jobs: + pre_build: + - python docs/fix_and_clear_notebooks.py + +submodules: + include: all # Build documentation in the docs/ directory with Sphinx sphinx: @@ -22,8 +30,7 @@ sphinx: # Optionally declare the Python requirements required to build your docs python: install: - - requirements: 
requirements.txt + - requirements: docs/ISLP_labs/requirements.txt - requirements: docs/requirements.txt - - requirements: torch_requirements.txt - method: pip path: . diff --git a/ISLP/__init__.py b/ISLP/__init__.py index ae230d3..6cd1ee1 100644 --- a/ISLP/__init__.py +++ b/ISLP/__init__.py @@ -6,28 +6,74 @@ """ from os.path import join as pjoin +from importlib.resources import (as_file, + files) import pandas as pd, numpy as np -from pkg_resources import resource_filename +from sklearn.metrics import confusion_matrix as _confusion_matrix +from sklearn.metrics._classification import unique_labels # data originally saved via: [sm.datasets.get_rdataset(n, 'ISLR').data.to_csv('../ISLP/data/%s.csv' % n, index=False) for n in ['Carseats', 'College', 'Credit', 'Default', 'Hitters', 'Auto', 'OJ', 'Portfolio', 'Smarket', 'Wage', 'Weekly', 'Caravan']] +def _make_categorical(dataset): + unordered = _unordered.setdefault(dataset, []) + ordered = _ordered.setdefault(dataset, []) + with as_file(files('ISLP').joinpath('data', '%s.csv' % dataset)) as filename: + df = pd.read_csv(filename) + for col in unordered: + df[col] = pd.Categorical(df[col]) + for col in ordered: + df[col] = pd.Categorical(df[col], ordered=True) + if dataset in _index: + df = df.set_index(_index[dataset]) + return df + +_unordered = {'Hitters':['League', 'Division', 'NewLeague'], + 'Caravan':['Purchase'], + 'Carseats':['ShelveLoc', 'Urban', 'US'], + 'College':['Private'], + 'Publication':['mech'], + 'BrainCancer':['sex', 'diagnosis', 'loc', 'stereo'], + 'Wage':['maritl', 'race', 'region', 'jobclass', 'health', 'health_ins'], + 'Default':['default', 'student'], + 'Credit':['Gender', 'Student', 'Married', 'Ethnicity'], + 'OJ':['Purchase', 'Store7'], + 'Smarket':['Direction'], + 'Weekly':['Direction'] + } +_ordered = {'Wage':['education'], + } +_index = {'Auto':'name'} + +_datasets = sorted(list(_unordered.keys()) + + list(_ordered.keys()) + + ['NCI60', + 'Khan', + 'Bikeshare', + 'NYSE']) + def 
load_data(dataset): + if dataset == 'NCI60': - features = resource_filename('ISLP', pjoin('data', 'NCI60data.npy')) - X = np.load(features) - labels = resource_filename('ISLP', pjoin('data', 'NCI60labs.csv')) - Y = pd.read_csv(labels) + with as_file(files('ISLP').joinpath('data', 'NCI60data.npy')) as features: + X = np.load(features) + with as_file(files('ISLP').joinpath('data', 'NCI60labs.csv')) as labels: + Y = pd.read_csv(labels) return {'data':X, 'labels':Y} elif dataset == 'Khan': - xtest = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_xtest.csv'))) + with as_file(files('ISLP').joinpath('data', 'Khan_xtest.csv')) as xtest: + xtest = pd.read_csv(xtest) xtest = xtest.rename(columns=dict([('V%d' % d, 'G%04d' % d) for d in range(1, len(xtest.columns)+0)])) - ytest = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_ytest.csv'))) + with as_file(files('ISLP').joinpath('data', 'Khan_ytest.csv')) as ytest: + ytest = pd.read_csv(ytest) ytest = ytest.rename(columns={'x':'Y'}) ytest = ytest['Y'] - xtrain = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_xtrain.csv'))) - xtrain = xtrain.rename(columns=dict([('V%d' % d, 'G%04d' % d) for d in range(1, len(xtest.columns)+0)])) - ytrain = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_ytrain.csv'))) + with as_file(files('ISLP').joinpath('data', 'Khan_xtrain.csv')) as xtrain: + xtrain = pd.read_csv(xtrain) + xtrain = xtrain.rename(columns=dict([('V%d' % d, 'G%04d' % d) for d in range(1, len(xtest.columns)+0)])) + + with as_file(files('ISLP').joinpath('data', 'Khan_ytrain.csv')) as ytrain: + ytrain = pd.read_csv(ytrain) ytrain = ytrain.rename(columns={'x':'Y'}) ytrain = ytrain['Y'] @@ -35,35 +81,10 @@ def load_data(dataset): 'xtrain':xtrain, 'ytest':ytest, 'ytrain':ytrain} - elif dataset == 'Hitters': - filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset)) - df = pd.read_csv(filename) - for col in ['League', 'Division', 'NewLeague']: - df[col] = pd.Categorical(df[col]) 
- return df - elif dataset == 'Carseats': - filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset)) - df = pd.read_csv(filename) - for col in ['ShelveLoc', 'Urban', 'US']: - df[col] = pd.Categorical(df[col]) - return df - elif dataset == 'NYSE': - filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset)) - df = pd.read_csv(filename).set_index('date') - return df - elif dataset == 'Publication': - df = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Publication.csv'))) - for col in ['mech']: - df[col] = pd.Categorical(df[col]) - return df - elif dataset == 'BrainCancer': - df = pd.read_csv(resource_filename('ISLP', pjoin('data', 'BrainCancer.csv'))) - for col in ['sex', 'diagnosis', 'loc', 'stereo']: - df[col] = pd.Categorical(df[col]) - return df + elif dataset == 'Bikeshare': - filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset)) - df = pd.read_csv(filename) + with as_file(files('ISLP').joinpath('data', '%s.csv' % dataset)) as filename: + df = pd.read_csv(filename) df['weathersit'] = pd.Categorical(df['weathersit'], ordered=False) # setting order to avoid alphabetical df['mnth'] = pd.Categorical(df['mnth'], @@ -78,26 +99,60 @@ def load_data(dataset): ordered=False, categories=range(24)) return df - elif dataset == 'Wage': - df = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Wage.csv'))) - df['education'] = pd.Categorical(df['education'], ordered=True) - return df + elif dataset == 'NYSE': + with as_file(files('ISLP').joinpath('data', '%s.csv' % dataset)) as filename: + df = pd.read_csv(filename) + # setting order to avoid alphabetical + df['day_of_week'] = pd.Categorical(df['day_of_week'], + ordered=False, + categories=['mon', + 'tues', + 'wed', + 'thur', + 'fri']) + return df.set_index('date') else: - filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset)) - return pd.read_csv(filename) + return _make_categorical(dataset) +load_data.__doc__ = f""" +Load dataset from ISLP package. 
-from sklearn.metrics import confusion_matrix as _confusion_matrix +Choices are: {_datasets} + +Parameters +---------- + +dataset: str + +Returns +------- + +data: array-like or dict + Either a `pd.DataFrame` representing the dataset or a dictionary + containing different parts of the dataset. + +""" def confusion_table(predicted_labels, - true_labels): + true_labels, + labels=None): """ Return a data frame version of confusion matrix with rows given by predicted label and columns the truth. + + Parameters + ---------- + + predicted_labels: array-like + These will form rows of confusion matrix. + + true_labels: array-like + These will form columns of confusion matrix. """ - labels = sorted(np.unique(list(true_labels) + - list(predicted_labels))) + if labels is None: + labels = unique_labels(true_labels, + predicted_labels) C = _confusion_matrix(true_labels, predicted_labels, labels=labels) @@ -109,3 +164,4 @@ def confusion_table(predicted_labels, from . import _version __version__ = _version.get_versions()['version'] + diff --git a/ISLP/_version.py b/ISLP/_version.py index 9b01ea2..c2d7406 100644 --- a/ISLP/_version.py +++ b/ISLP/_version.py @@ -5,8 +5,9 @@ # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. -# This file is released into the public domain. Generated by -# versioneer-0.21 (https://github.com/python-versioneer/python-versioneer) +# This file is released into the public domain. +# Generated by versioneer-0.29 +# https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" @@ -15,10 +16,11 @@ import re import subprocess import sys -from typing import Callable, Dict +from typing import Any, Callable, Dict, List, Optional, Tuple +import functools -def get_keywords(): +def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. 
# setup.py/versioneer.py will grep for the variable names, so they must @@ -34,8 +36,15 @@ def get_keywords(): class VersioneerConfig: """Container for Versioneer configuration parameters.""" + VCS: str + style: str + tag_prefix: str + parentdir_prefix: str + versionfile_source: str + verbose: bool -def get_config(): + +def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py @@ -57,9 +66,9 @@ class NotThisMethod(Exception): HANDLERS: Dict[str, Dict[str, Callable]] = {} -def register_vcs_handler(vcs, method): # decorator +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} @@ -68,11 +77,25 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None + + popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + for command in commands: try: dispcmd = str([command] + args) @@ -80,10 +103,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, process = subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr - else None)) + else None), **popen_kwargs) break - except 
OSError: - e = sys.exc_info()[1] + except OSError as e: if e.errno == errno.ENOENT: continue if verbose: @@ -103,7 +125,11 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, process.returncode -def versions_from_parentdir(parentdir_prefix, root, verbose): +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both @@ -128,13 +154,13 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): @register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. - keywords = {} + keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: @@ -156,7 +182,11 @@ def git_get_keywords(versionfile_abs): @register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") @@ -220,7 +250,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): +def git_pieces_from_vcs( + tag_prefix: str, + root: str, + verbose: bool, + runner: Callable = run_command +) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. 
This only gets called if the git-archive 'subst' keywords were *not* @@ -228,13 +263,18 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): version string, meaning we're inside a checked out source tree. """ GITS = ["git"] - TAG_PREFIX_REGEX = "*" if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - TAG_PREFIX_REGEX = r"\*" + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -242,11 +282,10 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", - "%s%s" % (tag_prefix, TAG_PREFIX_REGEX)], - cwd=root) + describe_out, rc = runner(GITS, [ + "describe", "--tags", "--dirty", "--always", "--long", + "--match", f"{tag_prefix}[[:digit:]]*" + ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -256,7 +295,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() - pieces = {} + pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None @@ -335,8 +374,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], 
cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() @@ -348,14 +387,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" -def render_pep440(pieces): +def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you @@ -380,7 +419,7 @@ def render_pep440(pieces): return rendered -def render_pep440_branch(pieces): +def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards @@ -410,7 +449,7 @@ def render_pep440_branch(pieces): return rendered -def pep440_split_post(ver): +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the @@ -420,7 +459,7 @@ def pep440_split_post(ver): return vc[0], int(vc[1] or 0) if len(vc) == 2 else None -def render_pep440_pre(pieces): +def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. 
Exceptions: @@ -432,7 +471,7 @@ def render_pep440_pre(pieces): tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: - rendered += ".post%d.dev%d" % (post_version+1, pieces["distance"]) + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: @@ -444,7 +483,7 @@ def render_pep440_pre(pieces): return rendered -def render_pep440_post(pieces): +def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards @@ -471,7 +510,7 @@ def render_pep440_post(pieces): return rendered -def render_pep440_post_branch(pieces): +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. @@ -500,7 +539,7 @@ def render_pep440_post_branch(pieces): return rendered -def render_pep440_old(pieces): +def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. @@ -522,7 +561,7 @@ def render_pep440_old(pieces): return rendered -def render_git_describe(pieces): +def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. @@ -542,7 +581,7 @@ def render_git_describe(pieces): return rendered -def render_git_describe_long(pieces): +def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. 
@@ -562,7 +601,7 @@ def render_git_describe_long(pieces): return rendered -def render(pieces, style): +def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", @@ -598,7 +637,7 @@ def render(pieces, style): "date": pieces.get("date")} -def get_versions(): +def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some diff --git a/ISLP/bart/bart.py b/ISLP/bart/bart.py index 2c33aba..3c933ca 100644 --- a/ISLP/bart/bart.py +++ b/ISLP/bart/bart.py @@ -101,11 +101,11 @@ def predict(self, check_is_fitted(self) nsample = len(self.trees_sample_) - output = np.zeros(X.shape[0], np.float) + output = np.zeros(X.shape[0], float) for trees in self.trees_sample_: for tree in trees: - tree_fit = np.array([tree.predict_out_of_sample(x) for x in X]) + tree_fit = np.array([tree.predict_out_of_sample(x) for x in np.asarray(X)]) output += tree_fit output = output / nsample return self._inverse(output) @@ -118,11 +118,11 @@ def staged_predict(self, trees_sample_ = self.trees_sample_[start_idx:] nsample = len(trees_sample_) - output = np.zeros((nsample, X.shape[0]), np.float) + output = np.zeros((nsample, X.shape[0]), float) for nstep, trees in enumerate(trees_sample_): for tree in trees: - tree_fit = np.array([tree.predict_out_of_sample(x) for x in X]) + tree_fit = np.array([tree.predict_out_of_sample(x) for x in np.asarray(X)]) output[nstep] += tree_fit output = np.cumsum(output, 0) / (np.arange(nsample) + 1)[:,None] @@ -141,7 +141,7 @@ def fit(self, if self.n_jobs <= 0: n_jobs = 1 - random_idx = random_state.randint(0,2**32-1,size=(n_jobs,)) + random_idx = random_state.randint(0,2**30-1,size=(n_jobs,)) # 2**31-1 should be OK for int32 parallel = Parallel(n_jobs=len(random_idx)) diff 
--git a/ISLP/bart/likelihood.py b/ISLP/bart/likelihood.py index 28f341d..cfa3ce6 100644 --- a/ISLP/bart/likelihood.py +++ b/ISLP/bart/likelihood.py @@ -82,7 +82,7 @@ def marginal_loglikelihood(response, if not incremental: if responsesq_sum is None: responsesq_sum = (response**2).sum() - response_moments = (n, response_sum, responseseq_sum) + response_moments = (n, response_sum, responsesq_sum) logL -= n * 0.5 * np.log(sigmasq) logL -= 0.5 * responsesq_sum / sigmasq diff --git a/ISLP/bart/tree.py b/ISLP/bart/tree.py index 8726929..49b4789 100644 --- a/ISLP/bart/tree.py +++ b/ISLP/bart/tree.py @@ -96,7 +96,7 @@ def predict_output(self): current_node = self.get_node(node_index) output[current_node.idx_data_points] = current_node.value - return output.astype(np.float) + return output.astype(float) def predict_out_of_sample(self, X): """ diff --git a/ISLP/info.py b/ISLP/info.py deleted file mode 100644 index 3a1fecd..0000000 --- a/ISLP/info.py +++ /dev/null @@ -1,78 +0,0 @@ -""" This file contains defines parameters for regreg that we use to fill -settings in setup.py, the regreg top-level docstring, and for building the docs. -In setup.py in particular, we exec this file, so it cannot import regreg -""" - -# regreg version information. An empty _version_extra corresponds to a -# full release. 
'.dev' as a _version_extra string means this is a development -# version -_version_major = 0 -_version_minor = 2 -_version_micro = 0 -_version_extra = '' - -# Format expected by setup.py and doc/source/conf.py: string of form "X.Y.Z" -__version__ = "%s.%s.%s%s" % (_version_major, - _version_minor, - _version_micro, - _version_extra) - -CLASSIFIERS = ["Development Status :: 3 - Alpha", - "Environment :: Console", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: BSD License", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Topic :: Scientific/Engineering"] - -description = 'Testing a fixed value of lambda' - -# Note: this long_description is actually a copy/paste from the top-level -# README.txt, so that it shows up nicely on PyPI. So please remember to edit -# it only in one place and sync it correctly. -long_description = \ -""" -============ -Fixed lambda -============ - -This mini-package contains a module to perform -a fixed lambda test for the LASSO. 
-""" - -# versions -NUMPY_MIN_VERSION='1.7.1' -SCIPY_MIN_VERSION = '0.9' -PANDAS_MIN_VERSION = "0.20" -SKLEARN_MIN_VERSION = '1.0' -STATSMODELS_MIN_VERSION = '0.13' -MATPLOTLIB_MIN_VERSION = '3.3.3' - -NAME = 'ISLP' -MAINTAINER = "Jonathan Taylor" -MAINTAINER_EMAIL = "" -DESCRIPTION = description -LONG_DESCRIPTION = long_description -URL = "http://github.org/jonathan.taylor/ISLP" -DOWNLOAD_URL = "" -LICENSE = "BSD license" -CLASSIFIERS = CLASSIFIERS -AUTHOR = "ISLP authors" -AUTHOR_EMAIL = "" -PLATFORMS = "OS Independent" -MAJOR = _version_major -MINOR = _version_minor -MICRO = _version_micro -ISRELEASE = _version_extra == '' -VERSION = __version__ -STATUS = 'alpha' -PROVIDES = [] -REQUIRES = ["numpy (>=%s)" % NUMPY_MIN_VERSION, - "scipy (>=%s)" % SCIPY_MIN_VERSION, - "statsmodels (>=%s)" % STATSMODELS_MIN_VERSION, - "pandas (>=%s)" % PANDAS_MIN_VERSION, - "sklearn (>=%s)" % SKLEARN_MIN_VERSION, - "lifelines", - "joblib", - "pygam" - ] diff --git a/ISLP/models/__init__.py b/ISLP/models/__init__.py index bf9cd55..cff02f8 100644 --- a/ISLP/models/__init__.py +++ b/ISLP/models/__init__.py @@ -4,14 +4,15 @@ """ import numpy as np, pandas as pd +from io import StringIO from .model_spec import (ModelSpec, Column, - Variable, + Feature, poly, ns, bs, - derived_variable, + derived_feature, pca, contrast, build_columns) @@ -24,13 +25,14 @@ sklearn_selection_path) def summarize(results, - conf_int=False): + conf_int=False, + level=None): """ Take a fit statsmodels and summarize it by returning the usual coefficient estimates, their standard errors, the usual test statistics and P-values as well as - (optionally) 95% confidence intervals. + (optionally) confidence intervals. Based on: @@ -45,8 +47,12 @@ def summarize(results, Include 95% confidence intervals? 
""" - tab = results.summary().tables[1] - results_table = pd.read_html(tab.as_html(), + if level is not None: + conf_int = True + if level is None: + level = 0.95 + tab = results.summary(alpha=1-level).tables[1] + results_table = pd.read_html(StringIO(tab.as_html()), index_col=0, header=0)[0] if not conf_int: @@ -57,12 +63,4 @@ def summarize(results, return results_table[results_table.columns[:-2]] return results_table -# def poly(X, degree): -# """ -# Create columns of design matrix -# for orthogonal polynomial for a given series X -# """ - -# result = Poly(degree=degree).fit_transform(X) - diff --git a/ISLP/models/columns.py b/ISLP/models/columns.py index c15ace2..7ea6adb 100644 --- a/ISLP/models/columns.py +++ b/ISLP/models/columns.py @@ -9,7 +9,6 @@ from sklearn.utils.validation import check_is_fitted from sklearn.exceptions import NotFittedError - class Column(NamedTuple): """ @@ -52,7 +51,7 @@ def get_columns(self, X, fit=False): Column names """ - cols = _get_column(self.idx, X, ndarray=False) + cols = _get_column(self.idx, X) if fit: self.fit_encoder(X) @@ -88,7 +87,7 @@ def fit_encoder(self, X): ------- None """ - cols = _get_column(self.idx, X, ndarray=False) + cols = _get_column(self.idx, X) if self.encoder is not None: try: check_is_fitted(self.encoder) @@ -102,41 +101,30 @@ def fit_encoder(self, X): def _get_column(idx, X, - twodim=False, - loc=True, - ndarray=True): + loc=True): """ - Extract column `idx` from `X`, - optionally making it two-dimensional - as many sklearn encoders assume - two-dimensional input + Extract column `idx` from `X` + as a two-dimensional ndarray or a pd.DataFrame """ if isinstance(X, np.ndarray): - col = X[:, idx] + col = X[:, [idx]] elif hasattr(X, 'loc'): if loc: - col = X.loc[:, idx] + col = X.loc[:, [idx]] else: # use iloc instead - col = X.iloc[:, idx] + col = X.iloc[:, [idx]] else: raise ValueError('expecting an ndarray or a ' + '"loc/iloc" methods, got %s' % str(X)) - if ndarray: - if twodim and np.asarray(col).ndim 
== 1: - return np.asarray(col).reshape((-1, 1)) - return np.asarray(col) - else: - return col + + return col def _get_column_info(X, columns, is_categorical, is_ordinal, - default_encoders={ - 'ordinal': OrdinalEncoder(), - 'categorical': OneHotEncoder() - } + categorical_encoders={} ): @@ -158,13 +146,19 @@ def _get_column_info(X, name = str(col) if is_categorical[i]: if is_ordinal[i]: - Xcol = _get_column(col, X, twodim=True) - encoder = clone(default_encoders['ordinal']) + Xcol = _get_column(col, X) + if col not in categorical_encoders: + encoder = clone(categorical_encoders['ordinal']) + else: + encoder = categorical_encoders[col] encoder.fit(Xcol) columns = ['{0}'.format(col)] else: - Xcol = _get_column(col, X, twodim=True, ndarray=True) - encoder = clone(default_encoders['categorical']) + Xcol = _get_column(col, X) + if col not in categorical_encoders: + encoder = clone(categorical_encoders['categorical']) + else: + encoder = categorical_encoders[col] cols = encoder.fit_transform(Xcol) if hasattr(encoder, 'columns_'): columns_ = encoder.columns_ @@ -179,7 +173,7 @@ def _get_column_info(X, tuple(columns), encoder) else: - Xcol = _get_column(col, X, twodim=True) + Xcol = _get_column(col, X) column_info[col] = Column(col, name, columns=(name,)) @@ -189,7 +183,6 @@ def _get_column_info(X, # https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py # max_bins is ignored - def _check_categories(categorical_features, X): """Check and validate categorical features in X diff --git a/ISLP/models/generic_selector.py b/ISLP/models/generic_selector.py index b0261e9..7c9329e 100644 --- a/ISLP/models/generic_selector.py +++ b/ISLP/models/generic_selector.py @@ -28,7 +28,10 @@ import scipy as sp from sklearn.metrics import get_scorer -from sklearn.base import (clone, MetaEstimatorMixin) +from sklearn.base import (clone, + MetaEstimatorMixin, + is_classifier, + is_regressor) from 
sklearn.model_selection import cross_val_score from joblib import Parallel, delayed @@ -149,13 +152,13 @@ def __init__(self, self.scoring = scoring if scoring is None: - if self.est_._estimator_type == 'classifier': + if is_classifier(self.est_): scoring = 'accuracy' - elif self.est_._estimator_type == 'regressor': + elif is_regressor(self.est_): scoring = 'r2' else: - raise AttributeError('Estimator must ' - 'be a Classifier or Regressor.') + scoring = None + if isinstance(scoring, str): self.scorer = get_scorer(scoring) else: @@ -486,6 +489,9 @@ def _calc_score(estimator, pre_dispatch='2*n_jobs', **fit_params): + if scorer is None: + scorer = lambda estimator, X, y: estimator.score(X, y) + X_state = build_submodel(X, state) if cv: diff --git a/ISLP/models/model_spec.py b/ISLP/models/model_spec.py index c5be3f9..d970bb7 100644 --- a/ISLP/models/model_spec.py +++ b/ISLP/models/model_spec.py @@ -35,7 +35,7 @@ DOCACHE = False -class Variable(NamedTuple): +class Feature(NamedTuple): """ An element in a model matrix that will build @@ -49,29 +49,64 @@ class Variable(NamedTuple): pure_columns: bool=False override_encoder_colnames: bool=False + #### contrast specific code class Contrast(TransformerMixin, BaseEstimator): - """ - Contrast encoding for categorical variables. - """ def __init__(self, method='drop', drop_level=None): + """ + Contrast encoding for categorical variables. + + Parameters + ---------- + method : ['drop', 'sum', None, callable] + If 'drop', then a column of the one-hot + encoding will be dropped. If 'sum', then the sum of + coefficients is constrained to sum to 1. + If `None`, the full one-hot encoding is returned. + Finally, if callable, then it should take the number of + levels of the category as a single argument and return + an appropriate contrast of the full one-hot encoding. + + drop_level : str (optional) + If not None, this level of the category + will be dropped if `method=='drop'`. 
+ + """ self.method = method self.drop_level = drop_level - def fit(self, X): + def fit(self, X, y=None): + + """ + Construct contrast of categorical variable + for use in building a design matrix. + + Parameters + ---------- + X : array-like + X on which model matrix will be evaluated. + If a :py:class:`pd.DataFrame` or :py:class:`pd.Series`, variables that are of + categorical dtype will be treated as categorical. + + Returns + ------- + F : array-like + Columns of design matrix implied by the + categorical variable. + + """ Xa = np.asarray(X).reshape((-1,1)) self.encoder_ = OneHotEncoder(drop=None, - sparse=False).fit(Xa) + sparse_output=False).fit(Xa) cats = self.encoder_.categories_[0] column_names = [str(n) for n in cats] - if isinstance(X, pd.DataFrame): # expecting a column, we take .iloc[:,0] X = X.iloc[:,0] @@ -98,7 +133,7 @@ def fit(self, X): if self.method == 'drop': self.columns_ = [column_names[j] for j in colmap] self.contrast_matrix_ = np.identity(len(cats)) - keep = np.ones(len(cats), np.bool) + keep = np.ones(len(cats), bool) keep[drop_idx] = 0 self.contrast_matrix_ = self.contrast_matrix_[:,keep] self.contrast_matrix_ = self.contrast_matrix_[:,colmap] @@ -119,6 +154,7 @@ def fit(self, X): raise ValueError('method must be one of ["drop", "sum", None] or a callable' + 'that returns a contrast matrix and column names given the number' + ' of levels') + return self def transform(self, X): @@ -136,22 +172,23 @@ def transform(self, X): class ModelSpec(TransformerMixin, BaseEstimator): - ''' - - Parameters + '''Parameters ---------- terms : sequence (optional) + Sequence of sets whose elements are columns of *X* when fit. For :py:class:`pd.DataFrame` these can be column names. intercept : bool (optional) + Include a column for intercept? categorical_features : array-like of {bool, int} of shape (n_features) or shape (n_categorical_features,), default=None. + Indicates the categorical features. 
Will be ignored if *X* is a :py:class:`pd.DataFrame` or :py:class:`pd.Series`. @@ -160,25 +197,31 @@ class ModelSpec(TransformerMixin, BaseEstimator): - integer array-like : integer indices indicating categorical features. - default_encoders : dict - Dictionary whose keys are elements of *terms* and values - are transforms to be applied to the associate columns in the model matrix - by running the *fit_transform* method when *fit* is called and overwriting - these values in the dictionary. + categorical_encoders : dict + + Dictionary whose keys are elements of *terms* that represent + **categorical variables**. Its values are transforms to be + applied to the associate columns in the model matrix by + running the *fit_transform* method when *fit* is called and + overwriting these values in the dictionary. + ''' def __init__(self, terms=[], intercept=True, categorical_features=None, - default_encoders={'categorical': Contrast(method='drop'), - 'ordinal': OrdinalEncoder()} + categorical_encoders={} ): self.intercept = intercept self.terms = terms self.categorical_features = categorical_features - self.default_encoders = default_encoders + + self.categorical_encoders = categorical_encoders + self.categorical_encoders_ = {'ordinal': OrdinalEncoder(), + 'categorical': Contrast(method='drop')} + self.categorical_encoders_.update(**categorical_encoders) def fit(self, X, y=None): @@ -203,7 +246,7 @@ def fit(self, X, y=None): X) self.columns_ = X.columns if self.is_categorical_ is None: - self.is_categorical_ = np.zeros(X.shape[1], np.bool) + self.is_categorical_ = np.zeros(X.shape[1], bool) self.is_ordinal_ = pd.Series(self.is_ordinal_, index=self.columns_) self.is_categorical_ = pd.Series(self.is_categorical_, @@ -214,32 +257,33 @@ def fit(self, X, y=None): self.known_categories_) = _check_categories(categorical_features, X) if self.is_categorical_ is None: - self.is_categorical_ = np.zeros(X.shape[1], np.bool) + self.is_categorical_ = np.zeros(X.shape[1], bool) 
self.is_ordinal_ = np.zeros(self.is_categorical_.shape, - np.bool) + bool) self.columns_ = np.arange(X.shape[1]) - self.variables_ = {} + self.features_ = {} self.encoders_ = {} self.column_info_ = _get_column_info(X, self.columns_, - self.is_categorical_, - self.is_ordinal_, - default_encoders=self.default_encoders) - # include each column as a Variable + np.asarray(self.is_categorical_), + np.asarray(self.is_ordinal_), + categorical_encoders=self.categorical_encoders_) + + # include each column as a Feature # so that their columns are built if needed for col_ in self.columns_: - self.variables_[col_] = Variable((col_,), str(col_), None, pure_columns=True) + self.features_[col_] = Feature((col_,), str(col_), None, pure_columns=True) - # find possible interactions and other variables + # find possible interactions and other features tmp_cache = {} for term in self.terms: - if isinstance(term, Variable): - self.variables_[term] = term + if isinstance(term, Feature): + self.features_[term] = term build_columns(self.column_info_, X, term, @@ -247,18 +291,18 @@ def fit(self, X, y=None): col_cache=tmp_cache, fit=True) # these encoders won't have been fit yet for var in term.variables: - if var not in self.variables_ and isinstance(var, Variable): - self.variables_[var] = var + if var not in self.features_ and isinstance(var, Feature): + self.features_[var] = var elif term not in self.column_info_: - # a tuple of variables represents an interaction + # a tuple of features represents an interaction if type(term) == type((1,)): names = [] column_map = {} column_names = {} idx = 0 for var in term: - if var in self.variables_: - var = self.variables_[var] + if var in self.features_: + var = self.features_[var] cols, cur_names = build_columns(self.column_info_, X, var, @@ -270,17 +314,17 @@ def fit(self, X, y=None): idx += cols.shape[1] names.append(var.name) encoder_ = Interaction(names, column_map, column_names) - self.variables_[term] = Variable(term, ':'.join(n for n in 
names), encoder_) + self.features_[term] = Feature(term, ':'.join(n for n in names), encoder_) elif isinstance(term, Column): - self.variables_[term] = Variable((term,), term.name, None, pure_columns=True) + self.features_[term] = Feature((term,), term.name, None, pure_columns=True) else: - raise ValueError('each element in a term should be a Variable, Column or identify a column') + raise ValueError('each element in a term should be a Feature, Column or identify a column') # build the mapping of terms to columns and column names self.column_names_ = {} self.column_map_ = {} - self.terms_ = [self.variables_[t] for t in self.terms] + self.terms_ = [self.features_[t] for t in self.terms] idx = 0 if self.intercept: @@ -310,64 +354,48 @@ def transform(self, X, y=None): Ignored. This parameter exists only for compatibility with :py:class:`sklearn.pipeline.Pipeline`. """ - return self.build_submodel(X, self.terms_) + check_is_fitted(self) + return build_model(self.column_info_, + X, + self.terms_, + intercept=self.intercept, + encoders=self.encoders_) # ModelSpec specific methods - def build_submodel(self, X, terms): + @property + def names(self, help='Name for each term in model specification.'): + names = [] + if self.intercept: + names = ['intercept'] + return names + [t.name for t in self.terms_] + + def build_submodel(self, + X, + terms): """ - Construct design matrix on a - sequence of terms and X after - fitting. + Build design on X after fitting. Parameters ---------- X : array-like - X on which model matrix will be evaluated. + X on which columns are evaluated. + + terms : [Feature] + Sequence of features Returns ------- - df : np.ndarray or pd.DataFrame - Design matrix. 
+ D : array-like + Design matrix created with `terms` """ - check_is_fitted(self) - - dfs = [] - - col_cache = {} # avoid recomputing the same columns - - if self.intercept: - df = pd.DataFrame({'intercept':np.ones(X.shape[0])}) - if isinstance(X, (pd.Series, pd.DataFrame)): - df.index = X.index - dfs.append(df) - - for term_ in terms: - term_df = build_columns(self.column_info_, - X, - term_, - col_cache=col_cache, - encoders=self.encoders_, - fit=False)[0] - dfs.append(term_df) - - if len(dfs): - if isinstance(X, (pd.Series, pd.DataFrame)): - df = pd.concat(dfs, axis=1) - df.index = X.index - return df - else: - return np.column_stack(dfs) - else: # return a 0 design - zero = np.zeros(X.shape[0]) - if isinstance(X, (pd.Series, pd.DataFrame)): - df = pd.DataFrame({'zero': zero}) - df.index = X.index - return df - else: - return zero + return build_model(self.column_info_, + X, + terms, + intercept=self.intercept, + encoders=self.encoders_) def build_sequence(self, X, @@ -375,6 +403,21 @@ def build_sequence(self, """ Build implied sequence of submodels based on successively including more terms. + + Parameters + ---------- + X : array-like + X on which columns are evaluated. + + anova_type: str + One of "sequential" or "drop". + + Returns + ------- + + models : generator + Generator for sequence of models for ANOVA. + """ check_is_fitted(self) @@ -427,8 +470,11 @@ def fit_encoder(encoders, var, X): Parameters ---------- - var : Variable - Variable whose encoder will be fit. + encoders : dict + Dictionary of encoders for each feature. + + var : Feature + Feature whose encoder will be fit. X : array-like X on which encoder will be fit. @@ -440,7 +486,7 @@ def fit_encoder(encoders, var, X): def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False): """ - Build columns for a Variable from X. + Build columns for a Feature from X. 
Parameters ---------- @@ -452,10 +498,13 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False): X : array-like X on which columns are evaluated. - var : Variable - Variable whose columns will be built, typically a key in `column_info`. + var : Feature + Feature whose columns will be built, typically a key in `column_info`. - col_cache: + encoders : dict + Dict that stores encoder of each Feature. + + col_cache: dict Dict where columns will be stored -- if `var.name` in `col_cache` then just returns those columns. @@ -480,7 +529,7 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False): cols, name = col_cache[joblib_hash([var, X])] else: cols, names = var.get_columns(X, fit=fit) - elif isinstance(var, Variable): + elif isinstance(var, Feature): cols = [] names = [] for v in var.variables: @@ -495,16 +544,18 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False): cols = np.column_stack(cols) if len(names) != cols.shape[1]: names = ['{0}[{1}]'.format(var.name, j) for j in range(cols.shape[1])] - if var.encoder: + df_cols = pd.DataFrame(np.asarray(cols), + columns=names) try: check_is_fitted(var.encoder) if fit and var not in encoders: raise ValueError('encoder has already been fit previously') except NotFittedError as e: if fit: - fit_encoder(var, pd.DataFrame(np.asarray(cols), - columns=names)) + fit_encoder(encoders, + var, + df_cols) # known issue with Pipeline # https://github.com/scikit-learn/scikit-learn/issues/18648 elif isinstance(var.encoder, Pipeline): @@ -514,9 +565,9 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False): except Exception as e: # was not the NotFitted raise ValueError(e) if var.use_transform: - cols = var.encoder.transform(cols) + cols = var.encoder.transform(df_cols) else: - cols = var.encoder.predict(cols) + cols = var.encoder.predict(df_cols) if hasattr(var.encoder, 'columns_') and not var.override_encoder_colnames: names = 
var.encoder.columns_ else: @@ -527,7 +578,7 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False): else: - raise ValueError('expecting either a column or a Variable') + raise ValueError('expecting either a column or a Feature') val = pd.DataFrame(np.asarray(cols), columns=names) if isinstance(X, (pd.DataFrame, pd.Series)): @@ -537,16 +588,88 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False): col_cache[joblib_hash([var.name, X])] = (val, names) return val, names +def build_model(column_info, + X, + terms, + intercept=True, + encoders={}): + + """ + Construct design matrix on a + sequence of terms and X after + fitting. + + Parameters + ---------- + column_info: dict + Dictionary with values specifying sets of columns to + be concatenated into a design matrix. + + X : array-like + X on which columns are evaluated. + + terms : [Feature] + Sequence of features + + encoders : dict + Dict that stores encoder of each Feature. -def derived_variable(variables, encoder=None, name=None, use_transform=True): + Returns + ------- + df : np.ndarray or pd.DataFrame + Design matrix. 
""" - Create a Variable, optionally + + dfs = [] + + col_cache = {} # avoid recomputing the same columns + + if intercept: + df = pd.DataFrame({'intercept':np.ones(X.shape[0])}) + if isinstance(X, (pd.Series, pd.DataFrame)): + df.index = X.index + dfs.append(df) + + for term_ in terms: + term_df = build_columns(column_info, + X, + term_, + col_cache=col_cache, + encoders=encoders, + fit=False)[0] + dfs.append(term_df) + + if len(dfs): + if isinstance(X, (pd.Series, pd.DataFrame)): + df = pd.concat(dfs, axis='columns') + df.index = X.index + else: + return np.column_stack(dfs).astype(float) + else: # return a 0 design + zero = np.zeros(X.shape[0]) + if isinstance(X, (pd.Series, pd.DataFrame)): + df = pd.DataFrame({'zero': zero}) + df.index = X.index + else: + return zero + + # if we reach here, we will be returning a DataFrame + # make sure all columns are floats + + for i, col in enumerate(df.columns): + if df.iloc[:,i].dtype == bool: + df[col] = df.iloc[:,i].astype(float) + return df + +def derived_feature(variables, encoder=None, name=None, use_transform=True): + """ + Create a Feature, optionally applying an encoder to the stacked columns. Parameters ---------- - variables : [column identifier, Column, Variable] + variables : [column identifier, Column, Feature] Variables to apply transform to. Could be column identifiers or variables: all columns will be stacked before encoding. 
@@ -560,12 +683,12 @@ def derived_variable(variables, encoder=None, name=None, use_transform=True): Returns ------- - var : Variable + var : Feature """ if name is None: name = str(encoder) - var = Variable(tuple([v for v in variables]), + var = Feature(tuple([v for v in variables]), name, encoder, use_transform=use_transform, @@ -590,7 +713,7 @@ def contrast(col, Returns ------- - var : Variable + var : Feature """ @@ -606,7 +729,7 @@ def contrast(col, is_categorical=True, encoder=encoder) -def ordinal(col, *args, **kwargs): +def ordinal(col, name=None, *args, **kwargs): """ Create ordinal encoding of categorical feature. @@ -618,7 +741,7 @@ def ordinal(col, *args, **kwargs): Returns ------- - var : Variable + var : Feature """ @@ -637,7 +760,7 @@ def ordinal(col, *args, **kwargs): name = f'{shortname}({name})' - return derived_variable([col], + return derived_feature([col], name=name, encoder=encoder) @@ -648,7 +771,7 @@ def poly(col, name=None): """ - Create a polynomial Variable + Create a polynomial Feature for a given column. Additional `args` and `kwargs` @@ -676,7 +799,7 @@ def poly(col, Returns ------- - var : Variable + var : Feature """ shortname, klass = 'poly', Poly encoder = klass(degree=degree, @@ -701,13 +824,13 @@ def poly(col, name = f'{shortname}({name})' - return derived_variable([col], + return derived_feature([col], name=name, encoder=encoder) def ns(col, intercept=False, name=None, **spline_args): """ - Create a natural spline Variable + Create a natural spline Feature for a given column. 
Additional *spline_args* @@ -727,7 +850,7 @@ def ns(col, intercept=False, name=None, **spline_args): Returns ------- - var : Variable + var : Feature """ shortname, klass = 'ns', NaturalSpline @@ -744,13 +867,13 @@ def ns(col, intercept=False, name=None, **spline_args): name = f'{shortname}({name})' encoder = klass(intercept=intercept, **spline_args) - return derived_variable([col], + return derived_feature([col], name=name, encoder=encoder) def bs(col, intercept=False, name=None, **spline_args): """ - Create a B-spline Variable + Create a B-spline Feature for a given column. Additional args and *spline_args* @@ -771,7 +894,7 @@ def bs(col, intercept=False, name=None, **spline_args): Returns ------- - var : Variable + var : Feature """ shortname, klass = 'bs', BSpline @@ -788,7 +911,7 @@ def bs(col, intercept=False, name=None, **spline_args): name = f'{shortname}({name})' encoder = klass(intercept=intercept, **spline_args) - return derived_variable([col], + return derived_feature([col], name=name, encoder=encoder) @@ -803,13 +926,13 @@ def pca(variables, name, scale=False, **pca_args): Parameters ---------- - variables : [column identifier, Column or Variable] + variables : [column identifier, Column or Feature] Sequence whose columns will be encoded by PCA. Returns ------- - var : Variable + var : Feature """ shortname, klass = 'pca', PCA @@ -824,52 +947,10 @@ def pca(variables, name, scale=False, **pca_args): if _args: name = ', '.join([name, _args]) - return derived_variable(variables, + return derived_feature(variables, name=f'{shortname}({name})', encoder=encoder) -# def clusterer(variables, name, transform, scale=False): -# """ -# Create PCA encoding of features -# from a sequence of variables. - -# Additional `args` and `kwargs` -# are passed to `PCA`. - -# Parameters -# ---------- - -# variables : [column identifier, Column or Variable] -# Sequence whose columns will be encoded by PCA. 
- -# name: str -# name for the Variable - -# transform: Transformer -# A transform with a `predict` method. - -# Returns -# ------- - -# var : Variable - -# """ - -# if scale: -# scaler = StandardScaler(with_mean=True, -# with_std=True) -# encoder = make_pipeline(scaler, transform) -# else: -# encoder = transform - -# intermed = Variable((derived_variable(*variables, -# name='cluster_intermed', -# encoder=encoder, -# use_transform=False),), -# name=f'Cat({encoder}({name}))', -# encoder=Contrast(method='drop')) - -# return intermed def _argstring(*args, **kwargs): _args = ', '.join([str(a) for a in args]) diff --git a/ISLP/models/sklearn_wrap.py b/ISLP/models/sklearn_wrap.py index 123130b..121da75 100644 --- a/ISLP/models/sklearn_wrap.py +++ b/ISLP/models/sklearn_wrap.py @@ -49,7 +49,17 @@ def __init__(self, self.model_type = model_type self.model_spec = model_spec self.model_args = model_args - + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + if self.model_type == sm.OLS: + tags.estimator_type = 'regressor' + elif (issubclass(self.model_type, sm.GLM) and + 'family' in self.model_args and + isinstance(self.model_args.get('family', None), sm.families.Binomial)): + tags.estimator_type = 'classifier' + return tags + def fit(self, X, y): """ Fit a statsmodel model @@ -171,6 +181,9 @@ def __init__(self, self.cv = cv self.scoring = scoring + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + return tags def fit(self, X, y): """ diff --git a/ISLP/models/strategy.py b/ISLP/models/strategy.py index 028ac94..f237db3 100644 --- a/ISLP/models/strategy.py +++ b/ISLP/models/strategy.py @@ -74,9 +74,9 @@ def __init__(self, Minumum number of terms to select max_terms: int (default: 0) Maximum number of terms to select - lower_terms: [Variable] + lower_terms: [Feature] Subset of terms to keep: smallest model. - upper_terms: [Variable] + upper_terms: [Feature] Largest possible model. 
validator: callable Callable taking a single argument: state, @@ -216,9 +216,9 @@ class Stepwise(MinMaxCandidates): Minumum number of terms to select max_terms: int (default: 1) Maximum number of terms to select - lower_terms: [Variable] + lower_terms: [Feature] Subset of terms to keep: smallest model. - upper_terms: [Variable] + upper_terms: [Feature] Largest possible model. constraints: {array-like} (optional), shape [n_terms, n_terms] Boolean matrix decribing a dag with [i,j] nonzero implying that j is @@ -342,9 +342,9 @@ def first_peak(model_spec, Minumum number of terms to select max_terms: int (default: 1) Maximum number of terms to select - lower_terms: [Variable] + lower_terms: [Feature] Subset of terms to keep: smallest model. - upper_terms: [Variable] + upper_terms: [Feature] Largest possible model. initial_terms: column identifiers, default=[] Subset of terms to be used to initialize when direction @@ -441,9 +441,9 @@ def fixed_steps(model_spec, max_terms: int (default: None) Maximum number of terms to select. If None defaults to number of terms in *model_spec*. - lower_terms: [Variable] + lower_terms: [Feature] Subset of terms to keep: smallest model. - upper_terms: [Variable] + upper_terms: [Feature] Largest possible model. initial_terms: column identifiers, default=[] Subset of terms to be used to initialize. @@ -506,9 +506,9 @@ def min_max(model_spec, Minumum number of terms to select max_terms: int (default: 1) Maximum number of terms to select - lower_terms: [Variable] + lower_terms: [Feature] Subset of terms to keep: smallest model. - upper_terms: [Variable] + upper_terms: [Feature] Largest possible model. 
validator: callable Callable taking a single argument: state, diff --git a/ISLP/survival.py b/ISLP/survival.py index b11967b..c352942 100644 --- a/ISLP/survival.py +++ b/ISLP/survival.py @@ -14,7 +14,7 @@ def sim_time(linpred, cum_hazard, - rng): + rng=None): """ Simulate a survival time for a cumulative hazard function $H$ with cumulative hazard @@ -39,6 +39,9 @@ def sim_time(linpred, Used to generate survival times. """ + if rng is None: + rng = np.random.default_rng() + U = rng.uniform() B = - np.log(U) / np.exp(linpred) lower, upper = 1, 2 diff --git a/ISLP/svm.py b/ISLP/svm.py index bedf288..8afcd5a 100644 --- a/ISLP/svm.py +++ b/ISLP/svm.py @@ -28,6 +28,12 @@ def plot(X, ''' Graphical representation of fitted support vector classifier. + There are two types of support vectors: + + - Points violating the margin but correctly classified. These are marked with a black '+'. + + - Misclassified points. These are marked with a red 'x'. + Parameters ---------- @@ -89,7 +95,7 @@ def plot(X, # draw the points - ax.scatter(X0, X1, c=Y, cmap=scatter_cmap) + ax.scatter(X0, X1, c=Y, cmap=scatter_cmap, s=200) # add the contour @@ -113,8 +119,27 @@ def plot(X, cmap=decision_cmap, alpha=alpha) - # add the support vectors + decision_val = svm.decision_function(X_pred) - ax.scatter(X[svm.support_,features[0]], - X[svm.support_,features[1]], marker='+', c='k', s=200) + # add the support vectors + if svm.classes_.shape[0] == 2: # 2-class problem + + ax.contourf(xval, + yval, + decision_val.reshape(yval.shape), + levels=[-1,1], + cmap=decision_cmap, + alpha=alpha) + + D = svm.decision_function(X[svm.support_]) + Y_ = (2 * (Y[svm.support_] == svm.classes_[1]) - 1) + violate_margin = (Y_ * D) > 0 + ax.scatter(X[svm.support_,features[0]][violate_margin], + X[svm.support_,features[1]][violate_margin], marker='+', c='k', s=50) + misclassified = ~violate_margin + ax.scatter(X[svm.support_,features[0]][misclassified], + X[svm.support_,features[1]][misclassified], marker='x', c='r', 
s=50) + else: + ax.scatter(X[svm.support_,features[0]], + X[svm.support_,features[1]], marker='+', c='k', s=50) diff --git a/ISLP/torch/imdb.py b/ISLP/torch/imdb.py index 617489d..3dfacfe 100644 --- a/ISLP/torch/imdb.py +++ b/ISLP/torch/imdb.py @@ -12,7 +12,6 @@ import torch from torch.utils.data import TensorDataset from scipy.sparse import load_npz -from pkg_resources import resource_filename from pickle import load as load_pickle import urllib diff --git a/ISLP/torch/lightning.py b/ISLP/torch/lightning.py index 82c45db..d7056ec 100644 --- a/ISLP/torch/lightning.py +++ b/ISLP/torch/lightning.py @@ -7,14 +7,14 @@ DataLoader, Dataset) from torch import tensor, Generator, concat -from torchvision import transforms + from torch.utils.data import TensorDataset from torchmetrics import Accuracy from pytorch_lightning import (LightningModule, LightningDataModule) -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.callbacks import Callback class SimpleDataModule(LightningDataModule): @@ -132,14 +132,15 @@ def __init__(self, model, loss, optimizer=None, - metrics={}, + metrics=None, on_epoch=True, pre_process_y_for_metrics=lambda y: y): super(SimpleModule, self).__init__() self.model = model - self.loss = loss or nn.MSELoss() + self.loss = loss + optimizer = optimizer or RMSprop(model.parameters()) self._optimizer = optimizer self.metrics = metrics @@ -160,8 +161,10 @@ def training_step(self, batch, batch_idx): y_ = self.pre_process_y_for_metrics(y) for _metric in self.metrics.keys(): + pl_metric = self.metrics[_metric] self.log(f"train_{_metric}", - self.metrics[_metric](preds, y_), + pl_metric(preds.to(pl_metric.device), + y_.to(pl_metric.device)), on_epoch=self.on_epoch) return loss @@ -181,22 +184,36 @@ def configure_optimizers(self): @staticmethod def regression(model, + metrics=None, + device='cpu', **kwargs): - loss = nn.MSELoss() + + if metrics is None: + metrics = {} + 
+ loss = nn.MSELoss().to(device) + if device is not None: + for key, metric in metrics.items(): + metrics[key] = metric.to(device) return SimpleModule(model, loss, + metrics=metrics, **kwargs) @staticmethod def binary_classification(model, - metrics={}, - device=None, + metrics=None, + device='cpu', **kwargs): + + if metrics is None: + metrics = {} + loss = nn.BCEWithLogitsLoss() if 'accuracy' not in metrics: - metrics['accuracy'] = Accuracy() + metrics['accuracy'] = Accuracy('binary') if device is not None: - for key, metric in metrics: + for key, metric in metrics.items(): metrics[key] = metric.to(device) return SimpleModule(model, loss, @@ -206,14 +223,20 @@ def binary_classification(model, @staticmethod def classification(model, - metrics={}, - device=None, + num_classes, + metrics=None, + device='cpu', **kwargs): - loss = nn.CrossEntropyLoss() + + if metrics is None: + metrics = {} + + loss = nn.CrossEntropyLoss().to(device) if 'accuracy' not in metrics: - metrics['accuracy'] = Accuracy() + metrics['accuracy'] = Accuracy('multiclass', + num_classes=num_classes) if device is not None: - for key, metric in metrics: + for key, metric in metrics.items(): metrics[key] = metric.to(device) return SimpleModule(model, loss, @@ -233,7 +256,7 @@ def on_validation_batch_start(self, pl_module, batch, batch_idx, - dataloader_idx): + dataloader_idx=0): x, y = batch self.val_preds.append(pl_module.forward(x)) self.val_targets.append(y) @@ -252,8 +275,10 @@ def on_validation_epoch_end(self, on_epoch=pl_module.on_epoch) for _metric in pl_module.metrics.keys(): + pl_metric = pl_module.metrics[_metric] pl_module.log(f"valid_{_metric}", - pl_module.metrics[_metric](preds, targets_), + pl_metric(preds.to(pl_metric.device), + targets_.to(pl_metric.device)), on_epoch=pl_module.on_epoch) def on_test_epoch_start(self, @@ -267,7 +292,7 @@ def on_test_batch_start(self, pl_module, batch, batch_idx, - dataloader_idx): + dataloader_idx=0): x, y = batch 
self.test_preds.append(pl_module.forward(x)) self.test_targets.append(y) @@ -286,7 +311,9 @@ def on_test_epoch_end(self, on_epoch=pl_module.on_epoch) for _metric in pl_module.metrics.keys(): + pl_metric = pl_module.metrics[_metric] pl_module.log(f"test_{_metric}", - pl_module.metrics[_metric](preds, targets_), + pl_metric(preds.to(pl_metric.device), + targets_.to(pl_metric.device)), on_epoch=pl_module.on_epoch) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..dd8ced0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,27 @@ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3)The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/README.md b/README.md index eb283fa..546ddba 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,99 @@ # ISLP + +[![All Contributors](https://img.shields.io/badge/all_contributors-3-orange.svg?style=flat-square)](#contributors-) + This package collects data sets and various helper functions for ISLP. ## Install instructions -### Mac OS X +### Mac OS X / Linux + +We generally recommend creating a [conda](https://anaconda.org) environment to isolate any code +from other dependencies. The `ISLP` package does not have unusual dependencies, but this is still +good practice. To create a conda environment in a Mac OS X or Linux environment run: ```{python} -pip install ISLP +conda create --name islp +``` + +To run python code in this environment, you must activate it: + +```{python} +conda activate islp ``` ### Windows -See the [https://packaging.python.org/en/latest/tutorials/installing-packages/#ensure-you-can-run-pip-from-the-command-line](python-packaging-instructions) for a simple way to run `pip` within -Jupyter. +On windows, create a `Python` environment called `islp` in the Anaconda app. This can be done by selecting `Environments` on the left hand side of the app's screen. After creating the environment, open a terminal within that environment by clicking on the "Play" button. -Alternatively, within a python shell, the following commands should install `ISLP`: + +## Installing `ISLP` + +Having completed the steps above, we use `pip` to install the `ISLP` package: ```{python} -import os, sys -cmd = f'{sys.executable} -m pip install ISLP' -os.system(cmd) +pip install ISLP ``` ### Torch requirements The `ISLP` labs use `torch` and various related packages for the lab on deep learning. The requirements -can be found [here](requirements.txt). 
Alternatively, you can install them directly using `pip` +are included in the requirements for `ISLP` with the exception of those needed +for the labs which are included in the [requirements for the labs](https://github.com/intro-stat-learning/ISLP_labs/blob/main/requirements.txt). + +## Jupyter + +### Mac OS X + +If JupyterLab is not already installed, run the following after having activated your `islp` environment: ```{python} -reqs = 'https://raw.githubusercontent.com/jonathan-taylor/ISLP/master/requirements.txt' -cmd = f'{sys.executable} -m pip install -r {reqs}' -os.system(cmd) +pip install jupyterlab ``` +### Windows + +Either use the same `pip` command above or install JupyterLab from the `Home` tab. Ensure that the environment +is your `islp` environment. This information appears near the top left in the Anaconda `Home` page. + + ## Documentation -See the [read the docs](https://islp.readthedocs.io/en/latest/models.html) +See the [docs](https://intro-stat-learning.github.io/ISLP/labs.html) for the latest documentation. + +## Authors + +- Jonathan Taylor +- Trevor Hastie +- Gareth James +- Robert Tibshirani +- Daniela Witten + + + + +## Contributors ✨ + +Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): + + + + + + + + + + + + +
danielawitten
danielawitten

💻 🖋
trevorhastie
trevorhastie

💻 🖋
tibshirani
tibshirani

💻 🖋
+ + + +This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! \ No newline at end of file diff --git a/docs/ISLP_labs b/docs/ISLP_labs new file mode 160000 index 0000000..5d793a3 --- /dev/null +++ b/docs/ISLP_labs @@ -0,0 +1 @@ +Subproject commit 5d793a33a8d5025181439b8d0f193c37c69ee20a diff --git a/docs/README.rst b/docs/README.rst new file mode 100644 index 0000000..41df584 --- /dev/null +++ b/docs/README.rst @@ -0,0 +1,15 @@ +Deep learning +============= + +This lab should be run as a notebook and saved + +Ridge regression +================ + +There is a snippet that should be inserted to remove the many warnings raised. + +Frozen reqs +=========== + +The versions of the labs are referred to in `source/installation.myst`, `source/labs.rst`. Version built +on `readthedocs` is references in `fix_and_run_notebooks.py` diff --git a/docs/fix_and_clear_notebooks.py b/docs/fix_and_clear_notebooks.py new file mode 100644 index 0000000..50eebe2 --- /dev/null +++ b/docs/fix_and_clear_notebooks.py @@ -0,0 +1,127 @@ + +from dataclasses import dataclass +from copy import copy + +import shlex +import subprocess +import os +import sys +import json +import nbformat +from argparse import ArgumentParser + +def get_version(): + import __main__ + dirname = os.path.split(__main__.__file__)[0] + sys.path.append(os.path.join(dirname, 'source')) + from conf import docs_version + sys.path = sys.path[:-1] + return docs_version + + +@dataclass +class Lab(object): + + labfile: str + version: str = 'v2' + rm_md: bool = True + + def __post_init__(self): + self.labfile = os.path.abspath(self.labfile) + + def fix_header(self): + labname = os.path.split(self.labfile)[1] + base = os.path.splitext(self.labfile)[0] + args = shlex.split(f'jupytext --set-formats ipynb,md:myst {self.labfile}') + subprocess.run(args) + + # successful run of jupytext + myst = open(f'{base}.md').read().strip() + split_myst = 
myst.split('\n') + new_myst = [] + + colab_code = f''' + +Open In Colab + + +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/{self.version}?labpath={labname}) + +''' + + chapter_buffer = 200 # should use a regex... + for l in split_myst[:chapter_buffer]: # assumes Chapter appears in first 200 linesmyst.split('\n') + if l.strip()[:9] != '# Chapter': # exclude the line with "# Chapter" + if 'Lab:' in l: + l = l.replace('Lab:', '') + '\n' + colab_code + new_myst.append(l) + + myst = '\n'.join(new_myst + split_myst[chapter_buffer:]) + + open(f'{base}.md', 'w').write(myst) + + args = shlex.split(f'jupytext --sync {base}.ipynb') + subprocess.run(args) + + args = shlex.split(f'jupytext --set-formats Rmd,ipynb {base}.ipynb') + subprocess.run(args) + + args = shlex.split(f'jupytext --sync {base}.ipynb') + subprocess.run(args) + + if self.rm_md: + subprocess.run(['rm', f'{base}.md']) + +def fix_Ch06(Ch06_nbfile): + + nb = nbformat.read(open(Ch06_nbfile), 4) + + md_cell = copy(nb.cells[0]) + md_cell['id'] = md_cell['id'] + '_duplicate' + + src = ''' + +```{attention} +Using `skl.ElasticNet` to fit ridge regression +throws up many warnings. We have suppressed them below by a call to `warnings.simplefilter()`. 
+``` + +''' + + md_cell['source'] = [l +'\n' for l in src.split('\n')] + + for i, cell in enumerate(nb.cells): + if cell['cell_type'] == 'code': + code_cell = copy(cell) + code_cell['id'] = code_cell['id'] + '_duplicate' + code_cell['source'] = ['import warnings\n', 'warnings.simplefilter("ignore")\n'] + break + + nb.cells.insert(i, md_cell) + nb.cells.insert(i+1, code_cell) + + nbformat.write(nb, open(Ch06_nbfile, 'w')) + subprocess.run(shlex.split(f'jupytext --sync {Ch06_nbfile}')) + +if __name__ == "__main__": + + docs_version = get_version() + + parser = ArgumentParser() + parser.add_argument('labs', + metavar='N', + type=str, + nargs='+') + parser.add_argument('--rm_md', + dest='rm_md', + action='store_true', + default=False) + + args = parser.parse_args() + + for labfile in args.labs: + l = Lab(labfile=labfile, version=docs_version['labs']) + l.fix_header() + if '06' in labfile: + fix_Ch06(labfile) + diff --git a/docs/jupyterbook/datasets/Auto.ipynb b/docs/jupyterbook/datasets/Auto.ipynb index f84fbfc..b88ea02 100644 --- a/docs/jupyterbook/datasets/Auto.ipynb +++ b/docs/jupyterbook/datasets/Auto.ipynb @@ -88,9 +88,9 @@ "formats": "source/datasets///ipynb,jupyterbook/datasets///md:myst,jupyterbook/datasets///ipynb" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Auto.md b/docs/jupyterbook/datasets/Auto.md index fe851ed..627d70b 100644 --- a/docs/jupyterbook/datasets/Auto.md +++ b/docs/jupyterbook/datasets/Auto.md @@ -5,11 +5,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Auto Data diff --git a/docs/jupyterbook/datasets/Bikeshare.ipynb b/docs/jupyterbook/datasets/Bikeshare.ipynb index b0edebc..ddb1053 100644 --- 
a/docs/jupyterbook/datasets/Bikeshare.ipynb +++ b/docs/jupyterbook/datasets/Bikeshare.ipynb @@ -102,9 +102,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Bikeshare.md b/docs/jupyterbook/datasets/Bikeshare.md index 90e1f7f..380bc1b 100644 --- a/docs/jupyterbook/datasets/Bikeshare.md +++ b/docs/jupyterbook/datasets/Bikeshare.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Bike sharing data diff --git a/docs/jupyterbook/datasets/Boston.ipynb b/docs/jupyterbook/datasets/Boston.ipynb index 1b5dce0..569f5b4 100644 --- a/docs/jupyterbook/datasets/Boston.ipynb +++ b/docs/jupyterbook/datasets/Boston.ipynb @@ -95,9 +95,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Boston.md b/docs/jupyterbook/datasets/Boston.md index 60b6f5e..1146a86 100644 --- a/docs/jupyterbook/datasets/Boston.md +++ b/docs/jupyterbook/datasets/Boston.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Boston Data diff --git a/docs/jupyterbook/datasets/BrainCancer.ipynb b/docs/jupyterbook/datasets/BrainCancer.ipynb index fd8e84e..cb75946 100644 --- a/docs/jupyterbook/datasets/BrainCancer.ipynb +++ b/docs/jupyterbook/datasets/BrainCancer.ipynb @@ -95,9 +95,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": 
"islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/BrainCancer.md b/docs/jupyterbook/datasets/BrainCancer.md index 3e1a2be..7307a69 100644 --- a/docs/jupyterbook/datasets/BrainCancer.md +++ b/docs/jupyterbook/datasets/BrainCancer.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Brain Cancer Data diff --git a/docs/jupyterbook/datasets/Caravan.ipynb b/docs/jupyterbook/datasets/Caravan.ipynb index ad1af58..f093422 100644 --- a/docs/jupyterbook/datasets/Caravan.ipynb +++ b/docs/jupyterbook/datasets/Caravan.ipynb @@ -63,9 +63,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Caravan.md b/docs/jupyterbook/datasets/Caravan.md index a42ddb1..24f8335 100644 --- a/docs/jupyterbook/datasets/Caravan.md +++ b/docs/jupyterbook/datasets/Caravan.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Caravan diff --git a/docs/jupyterbook/datasets/Carseats.ipynb b/docs/jupyterbook/datasets/Carseats.ipynb index 911e767..dfd36d4 100644 --- a/docs/jupyterbook/datasets/Carseats.ipynb +++ b/docs/jupyterbook/datasets/Carseats.ipynb @@ -83,9 +83,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Carseats.md 
b/docs/jupyterbook/datasets/Carseats.md index 3c74d37..76f56e4 100644 --- a/docs/jupyterbook/datasets/Carseats.md +++ b/docs/jupyterbook/datasets/Carseats.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Sales of Child Car Seats diff --git a/docs/jupyterbook/datasets/College.ipynb b/docs/jupyterbook/datasets/College.ipynb index ef2f53d..af1027d 100644 --- a/docs/jupyterbook/datasets/College.ipynb +++ b/docs/jupyterbook/datasets/College.ipynb @@ -104,9 +104,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/College.md b/docs/jupyterbook/datasets/College.md index 5e2e422..95b0bb3 100644 --- a/docs/jupyterbook/datasets/College.md +++ b/docs/jupyterbook/datasets/College.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # U.S. 
News and World Report's College Data diff --git a/docs/jupyterbook/datasets/Credit.ipynb b/docs/jupyterbook/datasets/Credit.ipynb index c4c79b5..f5e51a9 100644 --- a/docs/jupyterbook/datasets/Credit.ipynb +++ b/docs/jupyterbook/datasets/Credit.ipynb @@ -89,9 +89,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Credit.md b/docs/jupyterbook/datasets/Credit.md index 36d2502..51de59d 100644 --- a/docs/jupyterbook/datasets/Credit.md +++ b/docs/jupyterbook/datasets/Credit.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Credit Card Balance Data diff --git a/docs/jupyterbook/datasets/Default.ipynb b/docs/jupyterbook/datasets/Default.ipynb index 4799474..64357ef 100644 --- a/docs/jupyterbook/datasets/Default.ipynb +++ b/docs/jupyterbook/datasets/Default.ipynb @@ -83,9 +83,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Default.md b/docs/jupyterbook/datasets/Default.md index f1c9acc..5aeaed2 100644 --- a/docs/jupyterbook/datasets/Default.md +++ b/docs/jupyterbook/datasets/Default.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Credit Card Default Data diff --git a/docs/jupyterbook/datasets/Fund.ipynb b/docs/jupyterbook/datasets/Fund.ipynb index 905528d..fce1859 100644 --- 
a/docs/jupyterbook/datasets/Fund.ipynb +++ b/docs/jupyterbook/datasets/Fund.ipynb @@ -51,9 +51,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Fund.md b/docs/jupyterbook/datasets/Fund.md index 4e53d4f..89009c2 100644 --- a/docs/jupyterbook/datasets/Fund.md +++ b/docs/jupyterbook/datasets/Fund.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Fund Manager Data diff --git a/docs/jupyterbook/datasets/Hitters.ipynb b/docs/jupyterbook/datasets/Hitters.ipynb index 295f50b..6f261cd 100644 --- a/docs/jupyterbook/datasets/Hitters.ipynb +++ b/docs/jupyterbook/datasets/Hitters.ipynb @@ -110,9 +110,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Hitters.md b/docs/jupyterbook/datasets/Hitters.md index 7f8d6b7..2fdecf0 100644 --- a/docs/jupyterbook/datasets/Hitters.md +++ b/docs/jupyterbook/datasets/Hitters.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Baseball Data diff --git a/docs/jupyterbook/datasets/Khan.ipynb b/docs/jupyterbook/datasets/Khan.ipynb index a1f89a4..f12a5ca 100644 --- a/docs/jupyterbook/datasets/Khan.ipynb +++ b/docs/jupyterbook/datasets/Khan.ipynb @@ -81,9 +81,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": 
"python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Khan.md b/docs/jupyterbook/datasets/Khan.md index f943e99..6f0c303 100644 --- a/docs/jupyterbook/datasets/Khan.md +++ b/docs/jupyterbook/datasets/Khan.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Khan Gene Data diff --git a/docs/jupyterbook/datasets/NCI60.ipynb b/docs/jupyterbook/datasets/NCI60.ipynb index d8e2aec..bbb576f 100644 --- a/docs/jupyterbook/datasets/NCI60.ipynb +++ b/docs/jupyterbook/datasets/NCI60.ipynb @@ -62,9 +62,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/NCI60.md b/docs/jupyterbook/datasets/NCI60.md index 4cc96c6..621445e 100644 --- a/docs/jupyterbook/datasets/NCI60.md +++ b/docs/jupyterbook/datasets/NCI60.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # NCI 60 Data diff --git a/docs/jupyterbook/datasets/NYSE.ipynb b/docs/jupyterbook/datasets/NYSE.ipynb index d884201..5f9dbd5 100644 --- a/docs/jupyterbook/datasets/NYSE.ipynb +++ b/docs/jupyterbook/datasets/NYSE.ipynb @@ -79,9 +79,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/NYSE.md b/docs/jupyterbook/datasets/NYSE.md index a84a9d4..bdb9581 100644 --- a/docs/jupyterbook/datasets/NYSE.md +++ 
b/docs/jupyterbook/datasets/NYSE.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # New York Stock Exchange Data diff --git a/docs/jupyterbook/datasets/OJ.ipynb b/docs/jupyterbook/datasets/OJ.ipynb index 30046cb..e18a4de 100644 --- a/docs/jupyterbook/datasets/OJ.ipynb +++ b/docs/jupyterbook/datasets/OJ.ipynb @@ -107,9 +107,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/OJ.md b/docs/jupyterbook/datasets/OJ.md index 8681ea9..94fd7c6 100644 --- a/docs/jupyterbook/datasets/OJ.md +++ b/docs/jupyterbook/datasets/OJ.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Orange Juice Data diff --git a/docs/jupyterbook/datasets/Portfolio.ipynb b/docs/jupyterbook/datasets/Portfolio.ipynb index 0596162..6d6a60d 100644 --- a/docs/jupyterbook/datasets/Portfolio.ipynb +++ b/docs/jupyterbook/datasets/Portfolio.ipynb @@ -68,9 +68,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Portfolio.md b/docs/jupyterbook/datasets/Portfolio.md index e130b81..3a79d35 100644 --- a/docs/jupyterbook/datasets/Portfolio.md +++ b/docs/jupyterbook/datasets/Portfolio.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: 
islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Portfolio Data diff --git a/docs/jupyterbook/datasets/Publication.ipynb b/docs/jupyterbook/datasets/Publication.ipynb index c97b201..a4a6dfa 100644 --- a/docs/jupyterbook/datasets/Publication.ipynb +++ b/docs/jupyterbook/datasets/Publication.ipynb @@ -91,9 +91,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Publication.md b/docs/jupyterbook/datasets/Publication.md index 94c18bd..78261af 100644 --- a/docs/jupyterbook/datasets/Publication.md +++ b/docs/jupyterbook/datasets/Publication.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Time-to-Publication Data diff --git a/docs/jupyterbook/datasets/Smarket.ipynb b/docs/jupyterbook/datasets/Smarket.ipynb index 35a1918..cced2a9 100644 --- a/docs/jupyterbook/datasets/Smarket.ipynb +++ b/docs/jupyterbook/datasets/Smarket.ipynb @@ -87,9 +87,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Smarket.md b/docs/jupyterbook/datasets/Smarket.md index a42c94e..2c0e120 100644 --- a/docs/jupyterbook/datasets/Smarket.md +++ b/docs/jupyterbook/datasets/Smarket.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # S&P Stock Market Data diff --git 
a/docs/jupyterbook/datasets/USArrests.ipynb b/docs/jupyterbook/datasets/USArrests.ipynb index 4a6a1c0..1107424 100644 --- a/docs/jupyterbook/datasets/USArrests.ipynb +++ b/docs/jupyterbook/datasets/USArrests.ipynb @@ -202,9 +202,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/docs/jupyterbook/datasets/USArrests.md b/docs/jupyterbook/datasets/USArrests.md index 7cbede1..ee3c962 100644 --- a/docs/jupyterbook/datasets/USArrests.md +++ b/docs/jupyterbook/datasets/USArrests.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Violent Crime Rates by US State diff --git a/docs/jupyterbook/datasets/Wage.ipynb b/docs/jupyterbook/datasets/Wage.ipynb index ad8f9b0..b95d853 100644 --- a/docs/jupyterbook/datasets/Wage.ipynb +++ b/docs/jupyterbook/datasets/Wage.ipynb @@ -99,9 +99,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Wage.md b/docs/jupyterbook/datasets/Wage.md index eeeb3c4..fd22e30 100644 --- a/docs/jupyterbook/datasets/Wage.md +++ b/docs/jupyterbook/datasets/Wage.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Mid-Atlantic Wage Data diff --git a/docs/jupyterbook/datasets/Weekly.ipynb b/docs/jupyterbook/datasets/Weekly.ipynb index cf08b80..69f26d6 100644 --- a/docs/jupyterbook/datasets/Weekly.ipynb 
+++ b/docs/jupyterbook/datasets/Weekly.ipynb @@ -95,9 +95,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/datasets/Weekly.md b/docs/jupyterbook/datasets/Weekly.md index c0639ea..c239c5e 100644 --- a/docs/jupyterbook/datasets/Weekly.md +++ b/docs/jupyterbook/datasets/Weekly.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Weekly S&P Stock Market Data diff --git a/docs/jupyterbook/helpers/cluster.ipynb b/docs/jupyterbook/helpers/cluster.ipynb index bf237a3..31798a0 100644 --- a/docs/jupyterbook/helpers/cluster.ipynb +++ b/docs/jupyterbook/helpers/cluster.ipynb @@ -8,15 +8,27 @@ "# Clustering\n", "\n", "This module has a single function, used to help visualize a dendrogram from a\n", - "hierarchical clustering." + "hierarchical clustering. The function is based on this example from [sklearn.cluster](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d5df152d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'sklearn'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcluster\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AgglomerativeClustering\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcluster\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhierarchy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dendrogram\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mISLP\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcluster\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compute_linkage\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'" + ] + } + ], "source": [ "import numpy as np\n", "from sklearn.cluster import AgglomerativeClustering\n", @@ -34,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0135c1fb", "metadata": {}, "outputs": [], @@ -101,9 +113,21 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "islp_test" + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" } }, "nbformat": 4, diff --git a/docs/jupyterbook/helpers/cluster.md b/docs/jupyterbook/helpers/cluster.md index ab31348..b951b18 100644 --- a/docs/jupyterbook/helpers/cluster.md +++ b/docs/jupyterbook/helpers/cluster.md @@ -7,19 +7,19 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: Python 3 (ipykernel) language: python - name: islp_test + name: python3 --- # Clustering This module has a single function, used to help visualize a dendrogram from a -hierarchical clustering. +hierarchical clustering. The function is based on this example from [sklearn.cluster](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html). -```{code-cell} +```{code-cell} ipython3 import numpy as np from sklearn.cluster import AgglomerativeClustering from scipy.cluster.hierarchy import dendrogram @@ -28,7 +28,7 @@ from ISLP.cluster import compute_linkage ## Make a toy dataset -```{code-cell} +```{code-cell} ipython3 rng = np.random.default_rng(1) X = rng.normal(size=(30, 5)) X[:10] += 1 @@ -36,19 +36,19 @@ X[:10] += 1 ## Cluster it -```{code-cell} +```{code-cell} ipython3 clust = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='complete') ``` -```{code-cell} +```{code-cell} ipython3 clust.fit(X) ``` ## Plot the dendrogram -```{code-cell} +```{code-cell} ipython3 linkage = compute_linkage(clust) dendrogram(linkage); ``` diff --git a/docs/jupyterbook/helpers/pygam.ipynb b/docs/jupyterbook/helpers/pygam.ipynb index 01a1e55..aab61d1 100644 --- a/docs/jupyterbook/helpers/pygam.ipynb +++ b/docs/jupyterbook/helpers/pygam.ipynb @@ -207,9 +207,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + 
"display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/helpers/pygam.md b/docs/jupyterbook/helpers/pygam.md index c91084c..56adc84 100644 --- a/docs/jupyterbook/helpers/pygam.md +++ b/docs/jupyterbook/helpers/pygam.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Generalized Additive Models diff --git a/docs/jupyterbook/helpers/survival.ipynb b/docs/jupyterbook/helpers/survival.ipynb index e6b9e3a..7cb30a3 100644 --- a/docs/jupyterbook/helpers/survival.ipynb +++ b/docs/jupyterbook/helpers/survival.ipynb @@ -108,9 +108,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/helpers/survival.md b/docs/jupyterbook/helpers/survival.md index 715f8bd..58b129d 100644 --- a/docs/jupyterbook/helpers/survival.md +++ b/docs/jupyterbook/helpers/survival.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Survival Analysis diff --git a/docs/jupyterbook/helpers/svm.ipynb b/docs/jupyterbook/helpers/svm.ipynb index dac6c39..593d840 100644 --- a/docs/jupyterbook/helpers/svm.ipynb +++ b/docs/jupyterbook/helpers/svm.ipynb @@ -103,9 +103,9 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" } }, "nbformat": 4, diff --git a/docs/jupyterbook/helpers/svm.md b/docs/jupyterbook/helpers/svm.md index 007eb7a..3025490 
100644 --- a/docs/jupyterbook/helpers/svm.md +++ b/docs/jupyterbook/helpers/svm.md @@ -7,11 +7,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Support Vector Machines diff --git a/docs/jupyterbook/imdb.ipynb b/docs/jupyterbook/imdb.ipynb index d490921..ae0d7dd 100644 --- a/docs/jupyterbook/imdb.ipynb +++ b/docs/jupyterbook/imdb.ipynb @@ -5,71 +5,109 @@ "id": "50f2b809", "metadata": {}, "source": [ - "# Creating a clean IMDB dataset\n", + "# Creating IMDB dataset from `keras` version\n", + "\n", + "This script details how the `IMDB` data in `ISLP` was constructed.\n", "\n", "Running this example requires `keras`. Use `pip install keras` to install if necessary." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d920bb2e", "metadata": {}, "outputs": [], "source": [ - "import pickle" + "import pickle\n", + "import numpy as np\n", + "from scipy.sparse import coo_matrix, save_npz\n", + "import torch\n", + "from keras.datasets import imdb\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e507f1fb", + "cell_type": "markdown", + "id": "eaf27f0c-0cb0-4ad5-8775-d138e3f20933", "metadata": {}, - "outputs": [], "source": [ - "import numpy as np\n", - "from scipy.sparse import coo_matrix, save_npz\n", - "import torch" + "We first load the data using `keras`, limiting focus to the 10000 most commmon words." 
] }, { "cell_type": "code", - "execution_count": null, - "id": "b94d3f35", + "execution_count": 2, + "id": "29f0e01e", "metadata": {}, "outputs": [], "source": [ - "from keras.datasets import imdb\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences" + "# the 3 is for three terms: \n", + "num_words = 10000+3\n", + "((S_train, L_train), \n", + " (S_test, L_test)) = imdb.load_data(num_words=num_words)" + ] + }, + { + "cell_type": "markdown", + "id": "9020ab27-cc62-4b86-85ba-80a94ff692de", + "metadata": {}, + "source": [ + "The object `S_train` is effectively a list in which each document has been encoded into a sequence of\n", + "values from 0 to 10002." ] }, { "cell_type": "code", - "execution_count": null, - "id": "29f0e01e", + "execution_count": 3, + "id": "e27564c4-320f-42b6-9f2e-2a2afdebefcf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# the 3 is for three terms: \n", - "num_words = 10000+3\n", - "((S_train, Y_train), \n", - " (S_test, Y_test)) = imdb.load_data(num_words=num_words)" + "S_train[0][:10]" + ] + }, + { + "cell_type": "markdown", + "id": "15f039fe-faed-4884-a725-1c51d6c8d4d4", + "metadata": {}, + "source": [ + "We'll use `np.float32` as that is the common precision used in `torch`." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "6cc3c3cb", "metadata": {}, "outputs": [], "source": [ - "Y_train = Y_train.astype(np.float32)\n", - "Y_test = Y_test.astype(np.float32)" + "L_train = L_train.astype(np.float32)\n", + "L_test = L_test.astype(np.float32)" + ] + }, + { + "cell_type": "markdown", + "id": "005679bc-4337-4757-831e-f9a6ea50f6aa", + "metadata": {}, + "source": [ + "We will use a one-hot encoding that captures whether or not a given word appears in a given review." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7b6d1098", "metadata": {}, "outputs": [], @@ -88,18 +126,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "afcdc8b2", "metadata": {}, "outputs": [], "source": [ - "X_train, L_train = one_hot(S_train, num_words), Y_train\n", + "X_train = one_hot(S_train, num_words)\n", "X_test = one_hot(S_test, num_words)" ] }, + { + "cell_type": "markdown", + "id": "a67e299d-8774-4758-8953-77afdce775ab", + "metadata": {}, + "source": [ + "## Store as sparse tensors\n", + "\n", + "We see later in the lab that the dense representation is faster. Nevertheless,\n", + "let's store the one-hot representation as sparse `torch` tensors \n", + "as well as sparse `scipy` matrices." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "b19366ea", "metadata": {}, "outputs": [], @@ -115,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "b45ae6d1", "metadata": {}, "outputs": [], @@ -126,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "a47d6eb6", "metadata": {}, "outputs": [], @@ -137,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "d1b37b37", "metadata": {}, "outputs": [], @@ -151,12 +201,12 @@ "id": "1119823a", "metadata": {}, "source": [ - "save the sparse matrices" + "### Save as sparse `scipy` matrices" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "6cb6bfdf", "metadata": {}, "outputs": [], @@ -167,12 +217,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "eac1c2ae", "metadata": {}, "outputs": [], "source": [ - "np.save('IMDB_Y_test.npy', Y_test)\n", + "np.save('IMDB_Y_test.npy', L_test)\n", "np.save('IMDB_Y_train.npy', L_train)" ] }, @@ -181,12 +231,14 @@ "id": "25c128e3", "metadata": {}, "source": [ - "save and 
pickle the word index" + "## Save and pickle the word index\n", + "\n", + "We'll also want to store a lookup table to convert representations such as `S_train[0]` into words" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "8458bf67", "metadata": {}, "outputs": [], @@ -199,9 +251,46 @@ "lookup[4] = \"\"" ] }, + { + "cell_type": "markdown", + "id": "5e62ebff-2575-4d35-b46c-51c6f7598efc", + "metadata": {}, + "source": [ + "Let's look at our first training document:" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, + "id": "2aaefdf8-0a49-4bdb-8b40-55665283c8a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\" this film was just brilliant casting location scenery story direction everyone's really suited part they played and you\"" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "' '.join([lookup[i] for i in S_train[0][:20]])" + ] + }, + { + "cell_type": "markdown", + "id": "0e985a73-bfd9-42bd-a523-3dc6e223d602", + "metadata": {}, + "source": [ + "We save this lookup table so it can be loaded later " + ] + }, + { + "cell_type": "code", + "execution_count": 15, "id": "d95252de", "metadata": {}, "outputs": [], @@ -214,12 +303,15 @@ "id": "b3d900b9", "metadata": {}, "source": [ - "create the padded representations" + "## Padded representations\n", + "\n", + "For some of the recurrent models, we'll need sequences of common lengths, padded if necessary.\n", + "Here, we pad up to a maximum length of 500, filling the remaining entries with 0." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "637b3c5e", "metadata": {}, "outputs": [], @@ -230,9 +322,17 @@ " S_test]]" ] }, + { + "cell_type": "markdown", + "id": "a6218300-b355-44cc-b7fb-4bff81211aa6", + "metadata": {}, + "source": [ + "Finally, we save these for later use in the deep learning lab." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "bac69f88", "metadata": {}, "outputs": [], @@ -249,9 +349,21 @@ "main_language": "python" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" } }, "nbformat": 4, diff --git a/docs/jupyterbook/imdb.md b/docs/jupyterbook/imdb.md index 313952f..0b87bae 100644 --- a/docs/jupyterbook/imdb.md +++ b/docs/jupyterbook/imdb.md @@ -7,45 +7,54 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- -# Creating a clean IMDB dataset +# Creating IMDB dataset from `keras` version + +This script details how the `IMDB` data in `ISLP` was constructed. Running this example requires `keras`. Use `pip install keras` to install if necessary. -```{code-cell} +```{code-cell} ipython3 import pickle -``` - -```{code-cell} import numpy as np from scipy.sparse import coo_matrix, save_npz import torch -``` - -```{code-cell} from keras.datasets import imdb from tensorflow.keras.preprocessing.sequence import pad_sequences ``` -```{code-cell} +We first load the data using `keras`, limiting focus to the 10000 most commmon words. 
+ +```{code-cell} ipython3 # the 3 is for three terms: num_words = 10000+3 -((S_train, Y_train), - (S_test, Y_test)) = imdb.load_data(num_words=num_words) +((S_train, L_train), + (S_test, L_test)) = imdb.load_data(num_words=num_words) ``` -```{code-cell} -Y_train = Y_train.astype(np.float32) -Y_test = Y_test.astype(np.float32) +The object `S_train` is effectively a list in which each document has been encoded into a sequence of +values from 0 to 10002. + +```{code-cell} ipython3 +S_train[0][:10] +``` + +We'll use `np.float32` as that is the common precision used in `torch`. + +```{code-cell} ipython3 +L_train = L_train.astype(np.float32) +L_test = L_test.astype(np.float32) ``` -```{code-cell} +We will use a one-hot encoding that captures whether or not a given word appears in a given review. + +```{code-cell} ipython3 def one_hot(sequences, ncol): idx, vals = [], [] for i, s in enumerate(sequences): @@ -58,12 +67,18 @@ def one_hot(sequences, ncol): return tens.coalesce() ``` -```{code-cell} -X_train, L_train = one_hot(S_train, num_words), Y_train +```{code-cell} ipython3 +X_train = one_hot(S_train, num_words) X_test = one_hot(S_test, num_words) ``` -```{code-cell} +## Store as sparse tensors + +We see later in the lab that the dense representation is faster. Nevertheless, +let's store the one-hot representation as sparse `torch` tensors +as well as sparse `scipy` matrices. 
+ +```{code-cell} ipython3 def convert_sparse_tensor(X): idx = np.asarray(X.indices()) vals = np.asarray(X.values()) @@ -73,36 +88,38 @@ def convert_sparse_tensor(X): shape=X.shape).tocsr() ``` -```{code-cell} +```{code-cell} ipython3 X_train_s = convert_sparse_tensor(X_train) X_test_s = convert_sparse_tensor(X_test) ``` -```{code-cell} +```{code-cell} ipython3 X_train_d = torch.tensor(X_train_s.todense()) X_test_d = torch.tensor(X_test_s.todense()) ``` -```{code-cell} +```{code-cell} ipython3 torch.save(X_train_d, 'IMDB_X_train.tensor') torch.save(X_test_d, 'IMDB_X_test.tensor') ``` -save the sparse matrices +### Save as sparse `scipy` matrices -```{code-cell} +```{code-cell} ipython3 save_npz('IMDB_X_test.npz', X_test_s) save_npz('IMDB_X_train.npz', X_train_s) ``` -```{code-cell} -np.save('IMDB_Y_test.npy', Y_test) +```{code-cell} ipython3 +np.save('IMDB_Y_test.npy', L_test) np.save('IMDB_Y_train.npy', L_train) ``` -save and pickle the word index +## Save and pickle the word index -```{code-cell} +We'll also want to store a lookup table to convert representations such as `S_train[0]` into words + +```{code-cell} ipython3 word_index = imdb.get_word_index() lookup = {(i+3):w for w, i in word_index.items()} lookup[0] = "" @@ -111,20 +128,33 @@ lookup[2] = "" lookup[4] = "" ``` -```{code-cell} +Let's look at our first training document: + +```{code-cell} ipython3 +' '.join([lookup[i] for i in S_train[0][:20]]) +``` + +We save this lookup table so it can be loaded later + +```{code-cell} ipython3 pickle.dump(lookup, open('IMDB_word_index.pkl', 'bw')) ``` -create the padded representations +## Padded representations -```{code-cell} +For some of the recurrent models, we'll need sequences of common lengths, padded if necessary. +Here, we pad up to a maximum length of 500, filling the remaining entries with 0. 
+ +```{code-cell} ipython3 (S_train, S_test) = [torch.tensor(pad_sequences(S, maxlen=500, value=0)) for S in [S_train, S_test]] ``` -```{code-cell} +Finally, we save these for later use in the deep learning lab. + +```{code-cell} ipython3 torch.save(S_train, 'IMDB_S_train.tensor') torch.save(S_test, 'IMDB_S_test.tensor') ``` diff --git a/docs/jupyterbook/models/anova.ipynb b/docs/jupyterbook/models/anova.ipynb new file mode 100644 index 0000000..41e8bcb --- /dev/null +++ b/docs/jupyterbook/models/anova.ipynb @@ -0,0 +1,648 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ee33d364", + "metadata": {}, + "source": [ + "# ANOVA using `ModelSpec`\n", + "\n", + "\n", + "In this lab we illustrate how to run create specific ANOVA analyses\n", + "using `ModelSpec`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c70fbaa", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from statsmodels.api import OLS\n", + "from statsmodels.stats.anova import anova_lm\n", + "\n", + "from ISLP import load_data\n", + "from ISLP.models import (ModelSpec,\n", + " derived_feature,\n", + " summarize)" + ] + }, + { + "cell_type": "markdown", + "id": "333a49cf", + "metadata": {}, + "source": [ + "We will carry out two simple ANOVA analyses of the `Hitters` data.\n", + "We wish to predict a baseball player’s `Salary` on the\n", + "basis of various statistics associated with performance in the\n", + "previous year." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8a708215", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "59" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Hitters = load_data('Hitters')\n", + "np.isnan(Hitters['Salary']).sum()" + ] + }, + { + "cell_type": "markdown", + "id": "dad5e991", + "metadata": {}, + "source": [ + " \n", + " We see that `Salary` is missing for 59 players. 
The\n", + "`dropna()` method of data frames removes all of the rows that have missing\n", + "values in any variable (by default --- see `Hitters.dropna?`)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ac7086a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n", + " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n", + " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Hitters = Hitters.dropna()\n", + "Hitters.columns" + ] + }, + { + "cell_type": "markdown", + "id": "1a0a3521-be74-40df-a404-3895d80a11dc", + "metadata": {}, + "source": [ + "## Grouping variables\n", + "\n", + "A look at the [description](https://islp.readthedocs.io/en/latest/datasets/Hitters.html) of the data shows\n", + "that there are both career and 1986 offensive stats, as well as some defensive stats.\n", + "\n", + "Let's group the offensive into recent and career offensive stats, as well as a group of defensive variables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a215e43b-7bc8-4bdd-91cf-40d717cd7978", + "metadata": {}, + "outputs": [], + "source": [ + "confounders = derived_feature(['Division', 'League', 'NewLeague'],\n", + " name='confounders')\n", + "offense_career = derived_feature(['CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks'],\n", + " name='offense_career')\n", + "offense_1986 = derived_feature(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks'],\n", + " name='offense_1986')\n", + "defense_1986 = derived_feature(['PutOuts', 'Assists', 'Errors'],\n", + " name='defense_1986')" + ] + }, + { + "cell_type": "markdown", + "id": "aa15fd0c-1e8a-431e-8425-c61da8439976", + "metadata": {}, + "source": [ + "We'll first do a sequential ANOVA where terms are added sequentially" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "40cd6c28", + "metadata": {}, + "outputs": [], + "source": [ + "design = ModelSpec([confounders, offense_career, defense_1986, offense_1986]).fit(Hitters)\n", + "Y = np.array(Hitters['Salary'])\n", + "X = design.transform(Hitters)" + ] + }, + { + "cell_type": "markdown", + "id": "074120b1", + "metadata": {}, + "source": [ + "Along with a score we need to specify the search strategy. This is done through the object\n", + "`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()`\n", + "runs forward stepwise until any further additions to the model do not result\n", + "in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`\n", + "runs a fixed number of steps of stepwise search." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e65f5607", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
coefstd errtP>|t|
intercept148.218773.5952.0140.045
Division[W]-116.040440.188-2.8870.004
League[N]63.750379.0060.8070.421
NewLeague[N]-24.398978.843-0.3090.757
CAtBat-0.18870.120-1.5720.117
CHits0.16360.6650.2460.806
CHmRun-0.15171.612-0.0940.925
CRuns1.47160.7471.9710.050
CRBI0.80210.6911.1610.247
CWalks-0.81240.327-2.4810.014
PutOuts0.28270.0773.6610.000
Assists0.37550.2201.7050.089
Errors-3.29404.377-0.7530.452
AtBat-1.95090.624-3.1250.002
Hits7.43952.3633.1480.002
HmRun4.34496.1900.7020.483
Runs-2.33122.971-0.7850.433
RBI-1.06702.595-0.4110.681
Walks6.21961.8253.4090.001
\n", + "
" + ], + "text/plain": [ + " coef std err t P>|t|\n", + "intercept 148.2187 73.595 2.014 0.045\n", + "Division[W] -116.0404 40.188 -2.887 0.004\n", + "League[N] 63.7503 79.006 0.807 0.421\n", + "NewLeague[N] -24.3989 78.843 -0.309 0.757\n", + "CAtBat -0.1887 0.120 -1.572 0.117\n", + "CHits 0.1636 0.665 0.246 0.806\n", + "CHmRun -0.1517 1.612 -0.094 0.925\n", + "CRuns 1.4716 0.747 1.971 0.050\n", + "CRBI 0.8021 0.691 1.161 0.247\n", + "CWalks -0.8124 0.327 -2.481 0.014\n", + "PutOuts 0.2827 0.077 3.661 0.000\n", + "Assists 0.3755 0.220 1.705 0.089\n", + "Errors -3.2940 4.377 -0.753 0.452\n", + "AtBat -1.9509 0.624 -3.125 0.002\n", + "Hits 7.4395 2.363 3.148 0.002\n", + "HmRun 4.3449 6.190 0.702 0.483\n", + "Runs -2.3312 2.971 -0.785 0.433\n", + "RBI -1.0670 2.595 -0.411 0.681\n", + "Walks 6.2196 1.825 3.409 0.001" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "M = OLS(Y, X).fit()\n", + "summarize(M)" + ] + }, + { + "cell_type": "markdown", + "id": "29d9b55f", + "metadata": {}, + "source": [ + "We'll first produce the sequential, or Type I ANOVA results. This builds up a model sequentially and compares\n", + "two successive models." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cfbe5b92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
df_residssrdf_diffss_diffFPr(>F)
intercept262.05.331911e+070.0NaNNaNNaN
confounders259.05.131263e+073.02.006478e+066.7411472.144265e-04
offense_career253.03.059130e+076.02.072134e+0734.8086561.470455e-30
defense_1986250.02.730614e+073.03.285156e+0611.0371117.880207e-07
offense_1986244.02.420857e+076.03.097572e+065.2034444.648586e-05
\n", + "
" + ], + "text/plain": [ + " df_resid ssr df_diff ss_diff F \\\n", + "intercept 262.0 5.331911e+07 0.0 NaN NaN \n", + "confounders 259.0 5.131263e+07 3.0 2.006478e+06 6.741147 \n", + "offense_career 253.0 3.059130e+07 6.0 2.072134e+07 34.808656 \n", + "defense_1986 250.0 2.730614e+07 3.0 3.285156e+06 11.037111 \n", + "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n", + "\n", + " Pr(>F) \n", + "intercept NaN \n", + "confounders 2.144265e-04 \n", + "offense_career 1.470455e-30 \n", + "defense_1986 7.880207e-07 \n", + "offense_1986 4.648586e-05 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = anova_lm(*[OLS(Y, D).fit() for D in design.build_sequence(Hitters, anova_type='sequential')])\n", + "df.index = design.names\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "7092f666", + "metadata": {}, + "source": [ + "We can similarly compute the Type II ANOVA results which drops each term and compares to the full model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e2d43844", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
df_residssrdf_diffss_diffFPr(>F)
intercept244.02.420857e+071.04.024254e+054.0560764.511037e-02
confounders244.02.420857e+073.09.661738e+053.2460462.261572e-02
offense_career244.02.420857e+076.01.051025e+0717.6555965.701196e-17
defense_1986244.02.420857e+073.01.467933e+064.9318032.415732e-03
offense_1986244.02.420857e+076.03.097572e+065.2034444.648586e-05
\n", + "
" + ], + "text/plain": [ + " df_resid ssr df_diff ss_diff F \\\n", + "intercept 244.0 2.420857e+07 1.0 4.024254e+05 4.056076 \n", + "confounders 244.0 2.420857e+07 3.0 9.661738e+05 3.246046 \n", + "offense_career 244.0 2.420857e+07 6.0 1.051025e+07 17.655596 \n", + "defense_1986 244.0 2.420857e+07 3.0 1.467933e+06 4.931803 \n", + "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n", + "\n", + " Pr(>F) \n", + "intercept 4.511037e-02 \n", + "confounders 2.261572e-02 \n", + "offense_career 5.701196e-17 \n", + "defense_1986 2.415732e-03 \n", + "offense_1986 4.648586e-05 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "D_full = design.transform(Hitters)\n", + "OLS_full = OLS(Y, D_full).fit()\n", + "dfs = []\n", + "for d in design.build_sequence(Hitters, anova_type='drop'):\n", + " dfs.append(anova_lm(OLS(Y,d).fit(), OLS_full).iloc[1:])\n", + "df = pd.concat(dfs)\n", + "df.index = design.names\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "362709ae-9558-4c4c-8f5e-f8388caf631d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" + }, + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/jupyterbook/models/anova.md b/docs/jupyterbook/models/anova.md new file mode 100644 index 0000000..574f9eb --- /dev/null +++ b/docs/jupyterbook/models/anova.md @@ -0,0 +1,115 @@ +--- +jupytext: + formats: source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb + 
text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.14.5 +kernelspec: + display_name: python3 + language: python + name: python3 +--- + +# ANOVA using `ModelSpec` + + +In this lab we illustrate how to run create specific ANOVA analyses +using `ModelSpec`. + +```{code-cell} ipython3 +import numpy as np +import pandas as pd + +from statsmodels.api import OLS +from statsmodels.stats.anova import anova_lm + +from ISLP import load_data +from ISLP.models import (ModelSpec, + derived_feature, + summarize) +``` + +We will carry out two simple ANOVA analyses of the `Hitters` data. +We wish to predict a baseball player’s `Salary` on the +basis of various statistics associated with performance in the +previous year. + +```{code-cell} ipython3 +Hitters = load_data('Hitters') +np.isnan(Hitters['Salary']).sum() +``` + + + We see that `Salary` is missing for 59 players. The +`dropna()` method of data frames removes all of the rows that have missing +values in any variable (by default --- see `Hitters.dropna?`). + +```{code-cell} ipython3 +Hitters = Hitters.dropna() +Hitters.columns +``` + +## Grouping variables + +A look at the [description](https://islp.readthedocs.io/en/latest/datasets/Hitters.html) of the data shows +that there are both career and 1986 offensive stats, as well as some defensive stats. + +Let's group the offensive into recent and career offensive stats, as well as a group of defensive variables. 
+ +```{code-cell} ipython3 +confounders = derived_feature(['Division', 'League', 'NewLeague'], + name='confounders') +offense_career = derived_feature(['CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks'], + name='offense_career') +offense_1986 = derived_feature(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks'], + name='offense_1986') +defense_1986 = derived_feature(['PutOuts', 'Assists', 'Errors'], + name='defense_1986') +``` + +We'll first do a sequential ANOVA where terms are added sequentially + +```{code-cell} ipython3 +design = ModelSpec([confounders, offense_career, defense_1986, offense_1986]).fit(Hitters) +Y = np.array(Hitters['Salary']) +X = design.transform(Hitters) +``` + +Along with a score we need to specify the search strategy. This is done through the object +`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()` +runs forward stepwise until any further additions to the model do not result +in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()` +runs a fixed number of steps of stepwise search. + +```{code-cell} ipython3 +M = OLS(Y, X).fit() +summarize(M) +``` + +We'll first produce the sequential, or Type I ANOVA results. This builds up a model sequentially and compares +two successive models. + +```{code-cell} ipython3 +df = anova_lm(*[OLS(Y, D).fit() for D in design.build_sequence(Hitters, anova_type='sequential')]) +df.index = design.names +df +``` + +We can similarly compute the Type II ANOVA results which drops each term and compares to the full model. 
+ +```{code-cell} ipython3 +D_full = design.transform(Hitters) +OLS_full = OLS(Y, D_full).fit() +dfs = [] +for d in design.build_sequence(Hitters, anova_type='drop'): + dfs.append(anova_lm(OLS(Y,d).fit(), OLS_full).iloc[1:]) +df = pd.concat(dfs) +df.index = design.names +df +``` + +```{code-cell} ipython3 + +``` diff --git a/docs/jupyterbook/models/derived.ipynb b/docs/jupyterbook/models/derived.ipynb deleted file mode 100644 index 92fc096..0000000 --- a/docs/jupyterbook/models/derived.ipynb +++ /dev/null @@ -1,2125 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "38217f02", - "metadata": {}, - "source": [ - "# Building design matrices with `ModelSpec`\n", - "\n", - "Force rebuild" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3107d1f9", - "metadata": {}, - "outputs": [], - "source": [ - "x=4\n", - "import numpy as np, pandas as pd\n", - "%load_ext rpy2.ipython\n", - "\n", - "from ISLP import load_data\n", - "from ISLP.models import ModelSpec\n", - "\n", - "import statsmodels.api as sm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cdc46a4e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n", - " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n", - " dtype='object')" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats = load_data('Carseats')\n", - "%R -i Carseats\n", - "Carseats.columns" - ] - }, - { - "cell_type": "markdown", - "id": "e0a2a83a", - "metadata": {}, - "source": [ - "## Let's break up income into groups" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "68b40caf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: OIncome, Length: 400, dtype: 
category\n", - "Categories (3, object): ['L' < 'M' < 'H']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'])\n", - "Carseats['OIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "35558d88", - "metadata": {}, - "source": [ - "Let's also create an unordered version" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e5e81a95", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: UIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L', 'M', 'H']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'],\n", - " ordered=False)\n", - "Carseats['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "4bbf9e13", - "metadata": {}, - "source": [ - "## A simple model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1ad729b3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Price', 'Income'], dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Price', 'Income'])\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d05e9ec8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Y = Carseats['Sales']\n", - "M = 
sm.OLS(Y, X).fit()\n", - "M.params" - ] - }, - { - "cell_type": "markdown", - "id": "b4e9ee33", - "metadata": {}, - "source": [ - "## Basic procedure\n", - "\n", - "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n", - "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n", - "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "64ac65d3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Bad\n", - "1 Good\n", - "2 Medium\n", - "3 Medium\n", - "4 Bad\n", - " ... \n", - "395 Good\n", - "396 Medium\n", - "397 Medium\n", - "398 Bad\n", - "399 Good\n", - "Name: ShelveLoc, Length: 400, dtype: category\n", - "Categories (3, object): ['Bad', 'Good', 'Medium']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "620f0e01", - "metadata": {}, - "source": [ - "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n", - "`get_columns` and `fit_encoder`." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "77b898e0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "4580a6bf", - "metadata": {}, - "source": [ - "It recognized ordinal columns as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c2dab855", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['OIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "5e7963d6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([ 73, 48, 35, 100]), ('Income',))" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "income = design.column_info_['Income']\n", - "cols, names = income.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "markdown", - "id": "6b689966", - "metadata": {}, - "source": [ - "## Encoding a column\n", - "\n", - "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n", - "variables usually are encoded by several columns, typically one less than the number of categories.\n", - "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n", - "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n", - "its encoder the first time data is passed to it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ff3b96b6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([[0., 0.],\n", - " [1., 0.],\n", - " [0., 1.],\n", - " [0., 1.]]),\n", - " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shelve = design.column_info_['ShelveLoc']\n", - "cols, names = shelve.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "7e87da20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[2.],\n", - " [1.],\n", - " [1.],\n", - " [0.]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "oincome = design.column_info_['OIncome']\n", - "oincome.get_columns(Carseats)[0][:4]" - ] - }, - { - "cell_type": "markdown", - "id": "4f2030ac", - "metadata": {}, - "source": [ - "## The terms\n", - "\n", - "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n", - "the `terms` argument which should be a sequence. The elements of `terms` are often\n", - "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n", - "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "27fc4fb3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Price', 'Income']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "16316981", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "ef3f2bd0", - "metadata": {}, - "source": [ - "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n", - "`Variable` can also create columns through the `build_columns` method of `ModelSpec`" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "dd9c7fa6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price\n", - " 0 120\n", - " 1 83\n", - " 2 80\n", - " 3 97\n", - " 4 128\n", - " .. ...\n", - " 395 128\n", - " 396 120\n", - " 397 159\n", - " 398 95\n", - " 399 120\n", - " \n", - " [400 rows x 1 columns],\n", - " ['Price'])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "price = design.terms_[0]\n", - "design.build_columns(Carseats, price)" - ] - }, - { - "cell_type": "markdown", - "id": "5fc4cc45", - "metadata": {}, - "source": [ - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. 
The\n", - "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", - "the call to `ModelSpec.fit`." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "49d7fb46", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price Income UIncome[L] UIncome[M]\n", - " 0 120.0 73.0 0.0 1.0\n", - " 1 83.0 48.0 1.0 0.0\n", - " 2 80.0 35.0 1.0 0.0\n", - " 3 97.0 100.0 0.0 0.0\n", - " 4 128.0 64.0 0.0 1.0\n", - " .. ... ... ... ...\n", - " 395 128.0 108.0 0.0 0.0\n", - " 396 120.0 23.0 1.0 0.0\n", - " 397 159.0 26.0 1.0 0.0\n", - " 398 95.0 79.0 0.0 1.0\n", - " 399 120.0 37.0 1.0 0.0\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n", - "design.build_columns(Carseats, new_var)" - ] - }, - { - "cell_type": "markdown", - "id": "bdfc0fe9", - "metadata": {}, - "source": [ - "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", - "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "cf6f3f4c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( mynewvar[0] mynewvar[1]\n", - " 0 -3.608693 -4.853177\n", - " 1 15.081506 35.708630\n", - " 2 27.422871 40.774250\n", - " 3 -33.973209 13.470489\n", - " 4 6.567316 -11.290100\n", - " .. ... ...\n", - " 395 -36.846346 -18.415783\n", - " 396 45.741500 3.245602\n", - " 397 49.097533 -35.725355\n", - " 398 -13.577772 18.845139\n", - " 399 31.927566 0.978436\n", - " \n", - " [400 rows x 2 columns],\n", - " ['mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=2)\n", - "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n", - "design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "markdown", - "id": "1552d19a", - "metadata": {}, - "source": [ - "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "12d955dd", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( Price Price mynewvar[0] mynewvar[1]\n", - " 0 120.0 120.0 -3.608693 -4.853177\n", - " 1 83.0 83.0 15.081506 35.708630\n", - " 2 80.0 80.0 27.422871 40.774250\n", - " 3 97.0 97.0 -33.973209 13.470489\n", - " 4 128.0 128.0 6.567316 -11.290100\n", - " .. ... ... ... ...\n", - " 395 128.0 128.0 -36.846346 -18.415783\n", - " 396 120.0 120.0 45.741500 3.245602\n", - " 397 159.0 159.0 49.097533 -35.725355\n", - " 398 95.0 95.0 -13.577772 18.845139\n", - " 399 120.0 120.0 31.927566 0.978436\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n", - "design.build_columns(Carseats, fancy_var)" - ] - }, - { - "cell_type": "markdown", - "id": "f5ea292d", - "metadata": {}, - "source": [ - "We can of course run PCA again on these features (if we wanted)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "ae2af29b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( fancy_pca[0] fancy_pca[1]\n", - " 0 -6.951792 4.859283\n", - " 1 55.170148 -24.694875\n", - " 2 59.418556 -38.033572\n", - " 3 34.722389 28.922184\n", - " 4 -21.419184 -3.120673\n", - " .. ... 
...\n", - " 395 -18.257348 40.760122\n", - " 396 -10.546709 -45.021658\n", - " 397 -77.706359 -37.174379\n", - " 398 36.668694 7.730851\n", - " 399 -9.540535 -31.059122\n", - " \n", - " [400 rows x 2 columns],\n", - " ['fancy_pca[0]', 'fancy_pca[1]'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca2 = PCA(n_components=2)\n", - "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n", - "design.build_columns(Carseats, pca2_var)" - ] - }, - { - "cell_type": "markdown", - "id": "57305dbe", - "metadata": {}, - "source": [ - "## Building the design matrix\n", - "\n", - "With these notions in mind, the final design is essentially then" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "89656ec4", - "metadata": {}, - "outputs": [], - "source": [ - "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "f6cb8167", - "metadata": {}, - "source": [ - "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n", - "a dataframe the index is adjusted accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "547cb625", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.intercept" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ff5b41d5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptPriceIncome
01.012073
11.08348
21.08035
31.097100
\n", - "
" - ], - "text/plain": [ - " intercept Price Income\n", - "0 1.0 120 73\n", - "1 1.0 83 48\n", - "2 1.0 80 35\n", - "3 1.0 97 100" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.transform(Carseats)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "932759cf", - "metadata": {}, - "source": [ - "## Predicting\n", - "\n", - "Constructing the design matrix at any values is carried out by the `transform` method." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "e2190b00", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n", - "new_X = design.transform(new_data)\n", - "M.get_prediction(new_X).predicted_mean" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "6545c5da", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 \n", - "12.65258 12.25873 \n" - ] - } - ], - "source": [ - "%%R -i new_data,Carseats\n", - "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "cd088b51", - "metadata": {}, - "source": [ - "### Difference between using `pd.DataFrame` and `np.ndarray`\n", - "\n", - "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "\n", - "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "8f37ae20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1.0, 120, 73],\n", - " [1.0, 83, 48],\n", - " [1.0, 80, 35],\n", - " [1.0, 97, 100]], dtype=object)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n", - "design_np = ModelSpec([0,3]).fit(Carseats_np)\n", - "design_np.transform(Carseats_np)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "184aefc2", - "metadata": {}, - "source": [ - "The following will fail for hopefully obvious reasons" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "e4134980", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n" - ] - } - ], - "source": [ - "try:\n", - " new_D = np.zeros((2,2))\n", - " new_D[:,0] = [10,20]\n", - " new_D[:,1] = [40,50]\n", - " M.get_prediction(new_D).predicted_mean\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "53808f3b", - "metadata": {}, - "source": [ - "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", - "with a matrix having 3 columns (the first representing an intercept).\n", - "\n", - "We might be tempted to try as with the `pd.DataFrame` and produce\n", - "an `np.ndarray` with only the necessary variables." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "62059c57", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "index 3 is out of bounds for axis 1 with size 2\n" - ] - } - ], - "source": [ - "try:\n", - " new_X = np.zeros((2,2))\n", - " new_X[:,0] = [10,20]\n", - " new_X[:,1] = [40,50]\n", - " new_D = design_np.transform(new_X)\n", - " M.get_prediction(new_D).predicted_mean\n", - "except IndexError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "ded12f69", - "metadata": {}, - "source": [ - "This fails because `design_np` is looking for column `3` from its `terms`:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "fbb509d1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design_np.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "f01391e4", - "metadata": {}, - "source": [ - "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n", - "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "10df55ae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_X = np.zeros((2,4))\n", - "new_X[:,0] = [10,20]\n", - "new_X[:,3] = [40,50]\n", - "new_D = design_np.transform(new_X)\n", - "M.get_prediction(new_D).predicted_mean" - ] 
- }, - { - "cell_type": "markdown", - "id": "b43099fb", - "metadata": {}, - "source": [ - "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n", - "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case." - ] - }, - { - "cell_type": "markdown", - "id": "50bce64d", - "metadata": {}, - "source": [ - "## A model with some categorical variables\n", - "\n", - "Categorical variables become `Column` instances with encoders." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "2eb2ff16", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n", - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "6686dff8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "0e0eafd7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 11.876012\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[L] -1.042297\n", - "UIncome[M] -0.119123\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - 
] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "43cce209", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "99bf408e", - "metadata": {}, - "source": [ - "## Getting the encoding you want\n", - "\n", - "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n", - "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n", - "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n", - "where `new_encoder` would somehow drop the column we want dropped. \n", - "\n", - "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "11c19ebf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "4b48e5d2", - "metadata": {}, - "source": [ - "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n", - "we may want to encode `UIncome` differently within the same model. 
In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n", - "and we need a way to allow different encodings of the same column of `Carseats`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "81f641ba", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "79f7eb4d", - "metadata": {}, - "source": [ - " We can create a new \n", - "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "2afb3b5d", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import contrast\n", - "pref_encoding = contrast('UIncome', 'drop', 'L')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "c44692ab", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[M] UIncome[H]\n", - " 0 1.0 0.0\n", - " 1 0.0 0.0\n", - " 2 0.0 0.0\n", - " 3 0.0 1.0\n", - " 4 1.0 0.0\n", - " .. ... 
...\n", - " 395 0.0 1.0\n", - " 396 0.0 0.0\n", - " 397 0.0 0.0\n", - " 398 1.0 0.0\n", - " 399 0.0 0.0\n", - " \n", - " [400 rows x 2 columns],\n", - " ['UIncome[M]', 'UIncome[H]'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, pref_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "c0bfb2a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "d263056c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 10.833715\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[M] 0.923174\n", - "UIncome[H] 1.042297\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "edf0dc68", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "82071a54", - "metadata": {}, - "source": [ - "## Interactions\n", - 
"\n", - "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n", - "to `ModelSpec`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "cd18a4a4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 7.866634\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "UIncome[L] -2.734895\n", - "UIncome[M] -2.619745\n", - "dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "229fa32d", - "metadata": {}, - "source": [ - "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n", - "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "b8c52dbb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n", - " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n", - " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_[0]" - ] - }, - { - "cell_type": "markdown", - "id": "e7f93464", - "metadata": {}, - "source": [ - "Comparing this to the previous `R` model." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "4094c01f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "d448c9ca", - "metadata": {}, - "source": [ - "We note a few important things:\n", - "\n", - "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n", - "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n", - "\n", - "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. 
For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n", - "\n", - "A few notes:\n", - "\n", - "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n", - "\n", - "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "634e05c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[H] UIncome[L] UIncome[M]\n", - " 0 0.0 0.0 1.0\n", - " 1 0.0 1.0 0.0\n", - " 2 0.0 1.0 0.0\n", - " 3 1.0 0.0 0.0\n", - " 4 0.0 0.0 1.0\n", - " .. ... ... 
...\n", - " 395 1.0 0.0 0.0\n", - " 396 0.0 1.0 0.0\n", - " 397 0.0 1.0 0.0\n", - " 398 0.0 0.0 1.0\n", - " 399 0.0 1.0 0.0\n", - " \n", - " [400 rows x 3 columns],\n", - " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_encoding = contrast('UIncome', None)\n", - "design.build_columns(Carseats, full_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "4c09c93f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.131739\n", - "UIncome[M] 0.115150\n", - "UIncome[H] 1.156118\n", - "UIncome[H]:ShelveLoc[Good] 3.738052\n", - "UIncome[H]:ShelveLoc[Medium] 1.514104\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "dtype: float64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "48c1989f", - "metadata": {}, - "source": [ - "## Special encodings\n", - "\n", - "For flexible models, we may want to consider transformations of features, i.e. polynomial\n", - "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n", - "we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms`\n", - "package includes a `Poly` transform" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "85a28d87", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('Income',), name='poly(Income, 3)', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import poly\n", - "poly('Income', 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "e17c8a9d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.440077\n", - "poly(Income, 3)[0] 10.036373\n", - "poly(Income, 3)[1] -2.799156\n", - "poly(Income, 3)[2] 2.399601\n", - "ShelveLoc[Good] 4.808133\n", - "ShelveLoc[Medium] 1.889533\n", - "dtype: float64" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "944f56d6", - "metadata": {}, - "source": [ - "Compare:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "1889caca", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n", - " 5.440077 10.036373 -2.799156 2.399601 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.808133 1.889533 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "bd4dca31", - "metadata": {}, - "source": [ - "## Splines\n", - "\n", - "Support for natural and B-splines is also included" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "70fae990", - "metadata": {}, - 
"outputs": [], - "source": [ - "from ISLP.models.model_spec import ns, bs, pca" - ] - }, - { - "cell_type": "markdown", - "id": "2d812694", - "metadata": {}, - "source": [ - "## Custom encoding\n", - "\n", - "Instead of PCA we might run some clustering on some features and then uses the clusters to\n", - "create new features. This can be done with `derived_variable`. Indeed, `pca`, `ns` and `bs` are all examples\n", - "of this." - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "8e5d2305", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import derived_variable, Contrast" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "8a40c663", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 1, 2, 1, 2, 1, 0, 1, 0, 0, 0, 1, 2, 2, 0, 1, 2, 1, 0, 0, 0, 2,\n", - " 2, 2, 1, 2, 1, 0, 0, 1, 0, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2,\n", - " 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 0, 1, 2,\n", - " 0, 1, 1, 2, 1, 1, 2, 0, 0, 1, 1, 0, 2, 0, 1, 0, 0, 2, 2, 0, 1, 2,\n", - " 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 2, 0, 0, 1, 2, 0, 1, 0,\n", - " 0, 1, 0, 2, 0, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0,\n", - " 0, 0, 2, 1, 0, 2, 1, 1, 1, 2, 0, 0, 2, 0, 2, 1, 0, 0, 0, 1, 2, 2,\n", - " 1, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 1, 0, 0, 1, 1, 1, 0, 0, 2, 0,\n", - " 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 0, 2, 2, 1, 1, 2, 2, 0, 1, 1, 2, 2,\n", - " 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 2, 1, 1, 0, 0, 1, 2, 2, 1, 1,\n", - " 1, 2, 0, 2, 2, 2, 2, 0, 1, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 0, 0, 0,\n", - " 2, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 1, 2, 1, 2,\n", - " 2, 2, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 2,\n", - " 1, 2, 2, 1, 1, 0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 0,\n", - " 1, 0, 1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 0, 2, 1, 0, 1,\n", - " 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 1,\n", - " 1, 1, 2, 0, 0, 0, 0, 1, 0, 
2, 0, 2, 1, 2, 1, 0, 2, 1, 1, 0, 2, 2,\n", - " 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1, 1,\n", - " 2, 2, 0, 2], dtype=int32)" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cluster import KMeans\n", - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "cluster = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n", - "group = Variable(('Income', 'Price', 'Advertising', 'Population'), 'group', None)\n", - "X = design.build_submodel(Carseats, [group]).drop('intercept', axis=1)\n", - "cluster.fit(X.values)\n", - "cluster.predict(X.values)" - ] - }, - { - "cell_type": "markdown", - "id": "9bc38836", - "metadata": {}, - "source": [ - "For clustering, we often want to use the `predict` method rather than the `transform` method. If the ultimate\n", - "features all use `transform` then the do not even need to use these two calls to `make_pipeline`." - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "8ceab9b6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptmyclus
01.01
11.01
21.02
31.01
41.02
.........
3951.01
3961.02
3971.02
3981.00
3991.02
\n", - "

400 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " intercept myclus\n", - "0 1.0 1\n", - "1 1.0 1\n", - "2 1.0 2\n", - "3 1.0 1\n", - "4 1.0 2\n", - ".. ... ...\n", - "395 1.0 1\n", - "396 1.0 2\n", - "397 1.0 2\n", - "398 1.0 0\n", - "399 1.0 2\n", - "\n", - "[400 rows x 2 columns]" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n", - "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n", - " name='myclus', \n", - " encoder=cluster2,\n", - " use_transform=False)\n", - "design = ModelSpec([cluster_var]).fit(Carseats)\n", - "design.transform(Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "1f9b2630", - "metadata": {}, - "source": [ - "Somewhat clunkily, we can make this a categorical variable by creating a `Variable` with a\n", - "categorical encoder." - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "ffde00a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=(Variable(variables=('Income', 'Price', 'Advertising', 'Population'), name='myclus', encoder=Pipeline(steps=[('standardscaler', StandardScaler()),\n", - " ('kmeans', KMeans(n_clusters=3, random_state=0))]), use_transform=False, pure_columns=False, override_encoder_colnames=True),), name='mynewcat', encoder=Contrast(), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n", - "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n", - " name='myclus', \n", - " encoder=cluster2,\n", - " use_transform=False)\n", - "cat_cluster = Variable((cluster_var,), name='mynewcat', encoder=Contrast(method='drop'))\n", - "cat_cluster" - ] - }, - { - "cell_type": 
"code", - "execution_count": 54, - "id": "5afeab7c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
intercept12
01.01.00.0
11.01.00.0
21.00.01.0
31.01.00.0
41.00.01.0
............
3951.01.00.0
3961.00.01.0
3971.00.01.0
3981.00.00.0
3991.00.01.0
\n", - "

400 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " intercept 1 2\n", - "0 1.0 1.0 0.0\n", - "1 1.0 1.0 0.0\n", - "2 1.0 0.0 1.0\n", - "3 1.0 1.0 0.0\n", - "4 1.0 0.0 1.0\n", - ".. ... ... ...\n", - "395 1.0 1.0 0.0\n", - "396 1.0 0.0 1.0\n", - "397 1.0 0.0 1.0\n", - "398 1.0 0.0 0.0\n", - "399 1.0 0.0 1.0\n", - "\n", - "[400 rows x 3 columns]" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([cat_cluster]).fit(Carseats)\n", - "\n", - "design.transform(Carseats)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e24d5637-80fb-49bf-ac10-8ff68cb8bd8f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" - }, - "kernelspec": { - "display_name": "islp_test", - "language": "python", - "name": "islp_test" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/jupyterbook/models/derived.md b/docs/jupyterbook/models/derived.md deleted file mode 100644 index 1d0f23b..0000000 --- a/docs/jupyterbook/models/derived.md +++ /dev/null @@ -1,487 +0,0 @@ ---- -jupytext: - formats: source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.1 -kernelspec: - display_name: islp_test - language: python - name: islp_test ---- - -# Building design matrices with `ModelSpec` - -Force rebuild - -```{code-cell} ipython3 -x=4 -import numpy as np, pandas as pd -%load_ext rpy2.ipython - -from ISLP import load_data -from ISLP.models import ModelSpec - -import statsmodels.api as sm -``` - 
-```{code-cell} ipython3 -Carseats = load_data('Carseats') -%R -i Carseats -Carseats.columns -``` - -## Let's break up income into groups - -```{code-cell} ipython3 -Carseats['OIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H']) -Carseats['OIncome'] -``` - -Let's also create an unordered version - -```{code-cell} ipython3 -Carseats['UIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H'], - ordered=False) -Carseats['UIncome'] -``` - -## A simple model - -```{code-cell} ipython3 -design = ModelSpec(['Price', 'Income']) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -Y = Carseats['Sales'] -M = sm.OLS(Y, X).fit() -M.params -``` - -## Basic procedure - -The design matrix is built by cobbling together a set of columns and possibly transforming them. -A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit` -is to inspect a dataframe for column info. The column `ShelveLoc` is categorical: - -```{code-cell} ipython3 -Carseats['ShelveLoc'] -``` - -This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods -`get_columns` and `fit_encoder`. - -```{code-cell} ipython3 -design.column_info_['ShelveLoc'] -``` - -It recognized ordinal columns as well. - -```{code-cell} ipython3 -design.column_info_['OIncome'] -``` - -```{code-cell} ipython3 -income = design.column_info_['Income'] -cols, names = income.get_columns(Carseats) -(cols[:4], names) -``` - -## Encoding a column - -In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical -variables usually are encoded by several columns, typically one less than the number of categories. -This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform -model, i.e. `fit` on some array and `transform` on future arrays. 
The `fit_encoder` method of `Column` fits -its encoder the first time data is passed to it. - -```{code-cell} ipython3 -shelve = design.column_info_['ShelveLoc'] -cols, names = shelve.get_columns(Carseats) -(cols[:4], names) -``` - -```{code-cell} ipython3 -oincome = design.column_info_['OIncome'] -oincome.get_columns(Carseats)[0][:4] -``` - -## The terms - -The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through -the `terms` argument which should be a sequence. The elements of `terms` are often -going to be strings (or tuples of strings for interactions, see below) but are converted to a -`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple. - -```{code-cell} ipython3 -design.terms -``` - -```{code-cell} ipython3 -design.terms_ -``` - -While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A -`Variable` can also create columns through the `build_columns` method of `ModelSpec` - -```{code-cell} ipython3 -price = design.terms_[0] -design.build_columns(Carseats, price) -``` - -Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The -tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then -is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during -the call to `ModelSpec.fit`. - -```{code-cell} ipython3 -from ISLP.models.model_spec import Variable - -new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None) -design.build_columns(Carseats, new_var) -``` - -Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the -arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`. 
- -```{code-cell} ipython3 -from sklearn.decomposition import PCA -pca = PCA(n_components=2) -pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit` -pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca) -design.build_columns(Carseats, pca_var) -``` - -The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`) -or `Variable` instances (`pca_var`). - -```{code-cell} ipython3 -fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None) -design.build_columns(Carseats, fancy_var) -``` - -We can of course run PCA again on these features (if we wanted). - -```{code-cell} ipython3 -pca2 = PCA(n_components=2) -pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit` -pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2) -design.build_columns(Carseats, pca2_var) -``` - -## Building the design matrix - -With these notions in mind, the final design is essentially then - -```{code-cell} ipython3 -X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4] -``` - -An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is -a dataframe the index is adjusted accordingly. - -```{code-cell} ipython3 -design.intercept -``` - -```{code-cell} ipython3 -design.transform(Carseats)[:4] -``` - -## Predicting - -Constructing the design matrix at any values is carried out by the `transform` method. 
- -```{code-cell} ipython3 -new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]}) -new_X = design.transform(new_data) -M.get_prediction(new_X).predicted_mean -``` - -```{code-cell} ipython3 -%%R -i new_data,Carseats -predict(lm(Sales ~ Price + Income, data=Carseats), new_data) -``` - -### Difference between using `pd.DataFrame` and `np.ndarray` - -If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns. - -If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so, -in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. - -```{code-cell} ipython3 -Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']]) -design_np = ModelSpec([0,3]).fit(Carseats_np) -design_np.transform(Carseats_np)[:4] -``` - -The following will fail for hopefully obvious reasons - -```{code-cell} ipython3 -try: - new_D = np.zeros((2,2)) - new_D[:,0] = [10,20] - new_D[:,1] = [40,50] - M.get_prediction(new_D).predicted_mean -except ValueError as e: - print(e) -``` - -Ultimately, `M` expects 3 columns for new predictions because it was fit -with a matrix having 3 columns (the first representing an intercept). - -We might be tempted to try as with the `pd.DataFrame` and produce -an `np.ndarray` with only the necessary variables. 
- -```{code-cell} ipython3 -try: - new_X = np.zeros((2,2)) - new_X[:,0] = [10,20] - new_X[:,1] = [40,50] - new_D = design_np.transform(new_X) - M.get_prediction(new_D).predicted_mean -except IndexError as e: - print(e) -``` - -This fails because `design_np` is looking for column `3` from its `terms`: - -```{code-cell} ipython3 -design_np.terms_ -``` - -However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed -represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`: - -```{code-cell} ipython3 -new_X = np.zeros((2,4)) -new_X[:,0] = [10,20] -new_X[:,3] = [40,50] -new_D = design_np.transform(new_X) -M.get_prediction(new_D).predicted_mean -``` - -Given this subtlety about needing to supply arrays with identical column structure to `transform` when -using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case. - -+++ - -## A model with some categorical variables - -Categorical variables become `Column` instances with encoders. - -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats) -design.column_info_['UIncome'] -``` - -```{code-cell} ipython3 -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef -``` - -## Getting the encoding you want - -By default the level dropped by `ModelSpec` will be the first of the `categories_` values from -`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems -as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)` -where `new_encoder` would somehow drop the column we want dropped. 
- -However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`: - -```{code-cell} ipython3 -design.column_info_['UIncome'] -``` - -This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when -we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest -and we need a way to allow different encodings of the same column of `Carseats` - -```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) -``` - - We can create a new -`Column` with the encoder we want. For categorical variables, there is a convenience function to do so. - -```{code-cell} ipython3 -from ISLP.models.model_spec import contrast -pref_encoding = contrast('UIncome', 'drop', 'L') -``` - -```{code-cell} ipython3 -design.build_columns(Carseats, pref_encoding) -``` - -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef -``` - -## Interactions - -We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument -to `ModelSpec`. 
- -```{code-cell} ipython3 -design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with -`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`. - -```{code-cell} ipython3 -design.terms_[0] -``` - -Comparing this to the previous `R` model. - -```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) -``` - -We note a few important things: - -1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these -columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!** - -2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.** - -A few notes: - -- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.** - -- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily. 
- -```{code-cell} ipython3 -full_encoding = contrast('UIncome', None) -design.build_columns(Carseats, full_encoding) -``` - -```{code-cell} ipython3 -design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -## Special encodings - -For flexible models, we may want to consider transformations of features, i.e. polynomial -or spline transformations. Given transforms that follow the `fit/transform` paradigm -we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms` -package includes a `Poly` transform - -```{code-cell} ipython3 -from ISLP.models.model_spec import poly -poly('Income', 3) -``` - -```{code-cell} ipython3 -design = ModelSpec([poly('Income', 3), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -Compare: - -```{code-cell} ipython3 -%%R -lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef -``` - -## Splines - -Support for natural and B-splines is also included - -```{code-cell} ipython3 -from ISLP.models.model_spec import ns, bs, pca -``` - -## Custom encoding - -Instead of PCA we might run some clustering on some features and then uses the clusters to -create new features. This can be done with `derived_variable`. Indeed, `pca`, `ns` and `bs` are all examples -of this. - -```{code-cell} ipython3 -from ISLP.models.model_spec import derived_variable, Contrast -``` - -```{code-cell} ipython3 -from sklearn.cluster import KMeans -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -cluster = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0)) -group = Variable(('Income', 'Price', 'Advertising', 'Population'), 'group', None) -X = design.build_submodel(Carseats, [group]).drop('intercept', axis=1) -cluster.fit(X.values) -cluster.predict(X.values) -``` - -For clustering, we often want to use the `predict` method rather than the `transform` method. 
If the ultimate -features all use `transform` then the do not even need to use these two calls to `make_pipeline`. - -```{code-cell} ipython3 -cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0)) -cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], - name='myclus', - encoder=cluster2, - use_transform=False) -design = ModelSpec([cluster_var]).fit(Carseats) -design.transform(Carseats) -``` - -Somewhat clunkily, we can make this a categorical variable by creating a `Variable` with a -categorical encoder. - -```{code-cell} ipython3 -cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0)) -cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], - name='myclus', - encoder=cluster2, - use_transform=False) -cat_cluster = Variable((cluster_var,), name='mynewcat', encoder=Contrast(method='drop')) -cat_cluster -``` - -```{code-cell} ipython3 -design = ModelSpec([cat_cluster]).fit(Carseats) - -design.transform(Carseats) -``` - -```{code-cell} ipython3 - -``` diff --git a/docs/jupyterbook/models/selection.ipynb b/docs/jupyterbook/models/selection.ipynb index b41cf6a..fd66d95 100644 --- a/docs/jupyterbook/models/selection.ipynb +++ b/docs/jupyterbook/models/selection.ipynb @@ -2,2723 +2,259 @@ "cells": [ { "cell_type": "markdown", - "id": "72bae06a", + "id": "247387ec-1477-42e6-9e69-cad1cacb5721", "metadata": {}, "source": [ - "# Model selection using `ModelSpec`" + "# Model selection using `ModelSpec`\n", + "\n", + "\n", + "In this lab we illustrate how to run forward stepwise model selection\n", + "using the model specification capability of `ModelSpec`." 
] }, { "cell_type": "code", "execution_count": 1, - "id": "ae6bd850", + "id": "4720bb2a-6bec-4e91-a57e-9689aa4f0532", "metadata": {}, "outputs": [], "source": [ - "import numpy as np, pandas as pd\n", - "%load_ext rpy2.ipython\n", - "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from statsmodels.api import OLS\n", "from ISLP import load_data\n", - "from ISLP.models import ModelSpec\n", - "\n", - "import statsmodels.api as sm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5ac10e72", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n", - " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n", - " dtype='object')" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats = load_data('Carseats')\n", - "%R -i Carseats\n", - "Carseats.columns" - ] - }, - { - "cell_type": "markdown", - "id": "80a586d9", - "metadata": {}, - "source": [ - "## Let's break up income into groups" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "850356ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: OIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L' < 'M' < 'H']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'])\n", - "Carseats['OIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "e24def3a", - "metadata": {}, - "source": [ - "Let's also create an unordered version" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "edf83080", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", 
- "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: UIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L', 'M', 'H']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'],\n", - " ordered=False)\n", - "Carseats['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "aa22bb9c", - "metadata": {}, - "source": [ - "## A simple model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "38d92522", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Price', 'Income'], dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Price', 'Income'])\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "cfc2056f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Y = Carseats['Sales']\n", - "M = sm.OLS(Y, X).fit()\n", - "M.params" - ] - }, - { - "cell_type": "markdown", - "id": "4674c345", - "metadata": {}, - "source": [ - "## Basic procedure\n", - "\n", - "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n", - "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n", - "is to inspect a dataframe for column info. 
The column `ShelveLoc` is categorical:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "5688f0ad", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Bad\n", - "1 Good\n", - "2 Medium\n", - "3 Medium\n", - "4 Bad\n", - " ... \n", - "395 Good\n", - "396 Medium\n", - "397 Medium\n", - "398 Bad\n", - "399 Good\n", - "Name: ShelveLoc, Length: 400, dtype: category\n", - "Categories (3, object): ['Bad', 'Good', 'Medium']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['ShelveLoc']" + "from ISLP.models import (ModelSpec,\n", + " Stepwise,\n", + " sklearn_selected)" ] }, { "cell_type": "markdown", - "id": "4ae28ffa", + "id": "1c224240-ce8b-47f3-a85a-052c43038b26", "metadata": {}, "source": [ - "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n", - "`get_columns` and `fit_encoder`." + "### Forward Selection\n", + " \n", + "We will apply the forward-selection approach to the `Hitters` \n", + "data. We wish to predict a baseball player’s `Salary` on the\n", + "basis of various statistics associated with performance in the\n", + "previous year." ] }, { "cell_type": "code", - "execution_count": 8, - "id": "5f8926fd", + "execution_count": 2, + "id": "2adc66cc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())" + "59" ] }, - "execution_count": 8, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design.column_info_['ShelveLoc']" + "Hitters = load_data('Hitters')\n", + "np.isnan(Hitters['Salary']).sum()" ] }, { "cell_type": "markdown", - "id": "966f53a5", - "metadata": {}, - "source": [ - "It recognized ordinal columns as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "a137fa1e", + "id": "40c9a484", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "design.column_info_['OIncome']" + " \n", + " We see that `Salary` is missing for 59 players. The\n", + "`dropna()` method of data frames removes all of the rows that have missing\n", + "values in any variable (by default --- see `Hitters.dropna?`)." ] }, { "cell_type": "code", - "execution_count": 10, - "id": "3390dcb0", + "execution_count": 3, + "id": "1869fdab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([ 73, 48, 35, 100]), ('Income',))" + "(263, 20)" ] }, - "execution_count": 10, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "income = design.column_info_['Income']\n", - "cols, names = income.get_columns(Carseats)\n", - "(cols[:4], names)" + "Hitters = Hitters.dropna()\n", + "Hitters.shape" ] }, { "cell_type": "markdown", - "id": "b6667415", - "metadata": {}, - "source": [ - "## Encoding a column\n", - "\n", - "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n", - "variables usually are encoded by several columns, typically one less than the number of categories.\n", - "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n", - "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n", - "its encoder the first time data is passed to it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a1b42dbd", + "id": "0a1fe9e6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([[0., 0.],\n", - " [1., 0.],\n", - " [0., 1.],\n", - " [0., 1.]]),\n", - " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "shelve = design.column_info_['ShelveLoc']\n", - "cols, names = shelve.get_columns(Carseats)\n", - "(cols[:4], names)" + "We first choose the best model using forward selection based on AIC. This score\n", + "is not built in as a metric to `sklearn`. We therefore define a function to compute it ourselves, and use\n", + "it as a scorer. By default, `sklearn` tries to maximize a score, hence\n", + " our scoring function computes the negative AIC statistic." ] }, { "cell_type": "code", - "execution_count": 12, - "id": "31367988", + "execution_count": 4, + "id": "76bd8110", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[2.],\n", - " [1.],\n", - " [1.],\n", - " [0.]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "oincome = design.column_info_['OIncome']\n", - "oincome.get_columns(Carseats)[0][:4]" + "def negAIC(estimator, X, Y):\n", + " \"Negative AIC\"\n", + " n, p = X.shape\n", + " Yhat = estimator.predict(X)\n", + " MSE = np.mean((Y - Yhat)**2)\n", + " return n + n * np.log(MSE) + 2 * (p + 1)\n", + " " ] }, { "cell_type": "markdown", - "id": "751c1487", - "metadata": {}, - "source": [ - "## The terms\n", - "\n", - "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n", - "the `terms` argument which should be a sequence. 
The elements of `terms` are often\n", - "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n", - "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "6e2b6155", + "id": "14ba6f49", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Price', 'Income']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "design.terms" + "We need to estimate the residual variance $\\sigma^2$, which is the first argument in our scoring function above.\n", + "We will fit the biggest model, using all the variables, and estimate $\\sigma^2$ based on its MSE." ] }, { "cell_type": "code", - "execution_count": 14, - "id": "d3e669da", + "execution_count": 5, + "id": "94e10f35", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "design.terms_" + "design = ModelSpec(Hitters.columns.drop('Salary')).fit(Hitters)\n", + "Y = np.array(Hitters['Salary'])\n", + "X = design.transform(Hitters)" ] }, { "cell_type": "markdown", - "id": "fb0a45c9", + "id": "afdda5f2", "metadata": {}, "source": [ - "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n", - "`Variable` can also create columns through the `build_columns` method of `ModelSpec`" + "Along with a score we need to specify the search strategy. This is done through the object\n", + "`Stepwise()` in the `ISLP.models` package. 
The method `Stepwise.first_peak()`\n", + "runs forward stepwise until any further additions to the model do not result\n", + "in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`\n", + "runs a fixed number of steps of stepwise search." ] }, { "cell_type": "code", - "execution_count": 15, - "id": "554c67cb", + "execution_count": 6, + "id": "048c8500", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price\n", - " 0 120\n", - " 1 83\n", - " 2 80\n", - " 3 97\n", - " 4 128\n", - " .. ...\n", - " 395 128\n", - " 396 120\n", - " 397 159\n", - " 398 95\n", - " 399 120\n", - " \n", - " [400 rows x 1 columns],\n", - " ['Price'])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "price = design.terms_[0]\n", - "design.build_columns(Carseats, price)" + "strategy = Stepwise.first_peak(design,\n", + " direction='forward',\n", + " max_terms=len(design.terms))" ] }, { "cell_type": "markdown", - "id": "06956a6f", + "id": "e0c0af0e", "metadata": {}, "source": [ - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n", - "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", - "the call to `ModelSpec.fit`." + " \n", + "We now fit a linear regression model with `Salary` as outcome using forward\n", + "selection. To do so, we use the function `sklearn_selected()` from the `ISLP.models` package. This takes\n", + "a model from `statsmodels` along with a search strategy and selects a model with its\n", + "`fit` method. Without specifying a `scoring` argument, the score defaults to MSE, and so all 19 variables will be\n", + "selected." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "dd434884", + "execution_count": 7, + "id": "26f09fe9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "( Price Income UIncome[L] UIncome[M]\n", - " 0 120.0 73.0 0.0 1.0\n", - " 1 83.0 48.0 1.0 0.0\n", - " 2 80.0 35.0 1.0 0.0\n", - " 3 97.0 100.0 0.0 0.0\n", - " 4 128.0 64.0 0.0 1.0\n", - " .. ... ... ... ...\n", - " 395 128.0 108.0 0.0 0.0\n", - " 396 120.0 23.0 1.0 0.0\n", - " 397 159.0 26.0 1.0 0.0\n", - " 398 95.0 79.0 0.0 1.0\n", - " 399 120.0 37.0 1.0 0.0\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])" + "('Assists',\n", + " 'AtBat',\n", + " 'CAtBat',\n", + " 'CHits',\n", + " 'CHmRun',\n", + " 'CRBI',\n", + " 'CRuns',\n", + " 'CWalks',\n", + " 'Division',\n", + " 'Errors',\n", + " 'Hits',\n", + " 'HmRun',\n", + " 'League',\n", + " 'NewLeague',\n", + " 'PutOuts',\n", + " 'RBI',\n", + " 'Runs',\n", + " 'Walks',\n", + " 'Years')" ] }, - "execution_count": 16, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n", - "design.build_columns(Carseats, new_var)" + "hitters_MSE = sklearn_selected(OLS,\n", + " strategy)\n", + "hitters_MSE.fit(Hitters, Y)\n", + "hitters_MSE.selected_state_" ] }, { "cell_type": "markdown", - "id": "5cdb088c", + "id": "4acf4792", "metadata": {}, "source": [ - "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", - "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." + " Using `neg_Cp` results in a smaller model, as expected, with just 4variables selected." 
] }, { "cell_type": "code", - "execution_count": 17, - "id": "519a642e", + "execution_count": 8, + "id": "a825f4d8", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/plain": [ - "( mynewvar[0] mynewvar[1]\n", - " 0 -3.608693 -4.853177\n", - " 1 15.081506 35.708630\n", - " 2 27.422871 40.774250\n", - " 3 -33.973209 13.470489\n", - " 4 6.567316 -11.290100\n", - " .. ... ...\n", - " 395 -36.846346 -18.415783\n", - " 396 45.741500 3.245602\n", - " 397 49.097533 -35.725355\n", - " 398 -13.577772 18.845139\n", - " 399 31.927566 0.978436\n", - " \n", - " [400 rows x 2 columns],\n", - " ['mynewvar[0]', 'mynewvar[1]'])" + "('Assists', 'Errors', 'League', 'NewLeague')" ] }, - "execution_count": 17, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=2)\n", - "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n", - "design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "markdown", - "id": "403921a2", - "metadata": {}, - "source": [ - "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." 
+ "hitters_Cp = sklearn_selected(OLS,\n", + " strategy,\n", + " scoring=negAIC)\n", + "hitters_Cp.fit(Hitters, Y)\n", + "hitters_Cp.selected_state_" ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "b422cde1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( Price Price mynewvar[0] mynewvar[1]\n", - " 0 120.0 120.0 -3.608693 -4.853177\n", - " 1 83.0 83.0 15.081506 35.708630\n", - " 2 80.0 80.0 27.422871 40.774250\n", - " 3 97.0 97.0 -33.973209 13.470489\n", - " 4 128.0 128.0 6.567316 -11.290100\n", - " .. ... ... ... ...\n", - " 395 128.0 128.0 -36.846346 -18.415783\n", - " 396 120.0 120.0 45.741500 3.245602\n", - " 397 159.0 159.0 49.097533 -35.725355\n", - " 398 95.0 95.0 -13.577772 18.845139\n", - " 399 120.0 120.0 31.927566 0.978436\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n", - "design.build_columns(Carseats, fancy_var)" - ] - }, - { - "cell_type": "markdown", - "id": "53e38f57", - "metadata": {}, - "source": [ - "We can of course run PCA again on these features (if we wanted)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "6347acb6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( fancy_pca[0] fancy_pca[1]\n", - " 0 -6.951792 4.859283\n", - " 1 55.170148 -24.694875\n", - " 2 59.418556 -38.033572\n", - " 3 34.722389 28.922184\n", - " 4 -21.419184 -3.120673\n", - " .. ... 
...\n", - " 395 -18.257348 40.760122\n", - " 396 -10.546709 -45.021658\n", - " 397 -77.706359 -37.174379\n", - " 398 36.668694 7.730851\n", - " 399 -9.540535 -31.059122\n", - " \n", - " [400 rows x 2 columns],\n", - " ['fancy_pca[0]', 'fancy_pca[1]'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca2 = PCA(n_components=2)\n", - "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n", - "design.build_columns(Carseats, pca2_var)" - ] - }, - { - "cell_type": "markdown", - "id": "08b5ddb0", - "metadata": {}, - "source": [ - "## Building the design matrix\n", - "\n", - "With these notions in mind, the final design is essentially then" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "a8eb3e33", - "metadata": {}, - "outputs": [], - "source": [ - "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "97912337", - "metadata": {}, - "source": [ - "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n", - "a dataframe the index is adjusted accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "72b5e629", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.intercept" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "8a457e3e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptPriceIncome
01.012073
11.08348
21.08035
31.097100
\n", - "
" - ], - "text/plain": [ - " intercept Price Income\n", - "0 1.0 120 73\n", - "1 1.0 83 48\n", - "2 1.0 80 35\n", - "3 1.0 97 100" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.transform(Carseats)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "8624ab8c", - "metadata": {}, - "source": [ - "## Predicting\n", - "\n", - "Constructing the design matrix at any values is carried out by the `transform` method." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "6052765e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n", - "new_X = design.transform(new_data)\n", - "M.get_prediction(new_X).predicted_mean" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "9158de59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 \n", - "12.65258 12.25873 \n" - ] - } - ], - "source": [ - "%%R -i new_data,Carseats\n", - "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "9608bed3", - "metadata": {}, - "source": [ - "### Difference between using `pd.DataFrame` and `np.ndarray`\n", - "\n", - "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "\n", - "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "f0b8120f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1.0, 120, 73],\n", - " [1.0, 83, 48],\n", - " [1.0, 80, 35],\n", - " [1.0, 97, 100]], dtype=object)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n", - "design_np = ModelSpec([0,3]).fit(Carseats_np)\n", - "design_np.transform(Carseats_np)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "270a02a6", - "metadata": {}, - "source": [ - "The following will fail for hopefully obvious reasons" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "4ffbce7e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n" - ] - } - ], - "source": [ - "try:\n", - " new_D = np.zeros((2,2))\n", - " new_D[:,0] = [10,20]\n", - " new_D[:,1] = [40,50]\n", - " M.get_prediction(new_D).predicted_mean\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "bc5ff62b", - "metadata": {}, - "source": [ - "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", - "with a matrix having 3 columns (the first representing an intercept).\n", - "\n", - "We might be tempted to try as with the `pd.DataFrame` and produce\n", - "an `np.ndarray` with only the necessary variables." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "34dae1e9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "index 3 is out of bounds for axis 1 with size 2\n" - ] - } - ], - "source": [ - "try:\n", - " new_X = np.zeros((2,2))\n", - " new_X[:,0] = [10,20]\n", - " new_X[:,1] = [40,50]\n", - " new_D = design_np.transform(new_X)\n", - " M.get_prediction(new_D).predicted_mean\n", - "except IndexError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "7e9da262", - "metadata": {}, - "source": [ - "This fails because `design_np` is looking for column `3` from its `terms`:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "938b9430", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design_np.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "083e9529", - "metadata": {}, - "source": [ - "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n", - "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d413a9fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_X = np.zeros((2,4))\n", - "new_X[:,0] = [10,20]\n", - "new_X[:,3] = [40,50]\n", - "new_D = design_np.transform(new_X)\n", - "M.get_prediction(new_D).predicted_mean" - ] 
- }, - { - "cell_type": "markdown", - "id": "0f4b508b", - "metadata": {}, - "source": [ - "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n", - "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case." - ] - }, - { - "cell_type": "markdown", - "id": "8bcbd973", - "metadata": {}, - "source": [ - "## A model with some categorical variables\n", - "\n", - "Categorical variables become `Column` instances with encoders." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "cf13f72e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n", - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "c1fa0a90", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "b28aa313", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 11.876012\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[L] -1.042297\n", - "UIncome[M] -0.119123\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - 
] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "aa764acc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "31876a29", - "metadata": {}, - "source": [ - "## Getting the encoding you want\n", - "\n", - "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n", - "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n", - "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n", - "where `new_encoder` would somehow drop the column we want dropped. \n", - "\n", - "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "bac2643c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "1485735d", - "metadata": {}, - "source": [ - "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n", - "we may want to encode `UIncome` differently within the same model. 
In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n", - "and we need a way to allow different encodings of the same column of `Carseats`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "3987c5d6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "7a6631c9", - "metadata": {}, - "source": [ - " We can create a new \n", - "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "83a9b94e", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import contrast\n", - "pref_encoding = contrast('UIncome', 'drop', 'L')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "f0ffabea", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[M] UIncome[H]\n", - " 0 1.0 0.0\n", - " 1 0.0 0.0\n", - " 2 0.0 0.0\n", - " 3 0.0 1.0\n", - " 4 1.0 0.0\n", - " .. ... 
...\n", - " 395 0.0 1.0\n", - " 396 0.0 0.0\n", - " 397 0.0 0.0\n", - " 398 1.0 0.0\n", - " 399 0.0 0.0\n", - " \n", - " [400 rows x 2 columns],\n", - " ['UIncome[M]', 'UIncome[H]'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, pref_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "4a5fdc64", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "ae7e3bd2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 10.833715\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[M] 0.923174\n", - "UIncome[H] 1.042297\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "c12ac3df", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "53bf8aef", - "metadata": {}, - "source": [ - "## Interactions\n", - 
"\n", - "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n", - "to `ModelSpec`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "47723bce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 7.866634\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "UIncome[L] -2.734895\n", - "UIncome[M] -2.619745\n", - "dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "86060622", - "metadata": {}, - "source": [ - "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n", - "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "d7a2ab9b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n", - " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n", - " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_[0]" - ] - }, - { - "cell_type": "markdown", - "id": "2a5e7f6b", - "metadata": {}, - "source": [ - "Comparing this to the previous `R` model." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "bbb02036", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "89106a85", - "metadata": {}, - "source": [ - "We note a few important things:\n", - "\n", - "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n", - "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n", - "\n", - "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. 
For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n", - "\n", - "A few notes:\n", - "\n", - "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n", - "\n", - "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "151f3fee", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[H] UIncome[L] UIncome[M]\n", - " 0 0.0 0.0 1.0\n", - " 1 0.0 1.0 0.0\n", - " 2 0.0 1.0 0.0\n", - " 3 1.0 0.0 0.0\n", - " 4 0.0 0.0 1.0\n", - " .. ... ... 
...\n", - " 395 1.0 0.0 0.0\n", - " 396 0.0 1.0 0.0\n", - " 397 0.0 1.0 0.0\n", - " 398 0.0 0.0 1.0\n", - " 399 0.0 1.0 0.0\n", - " \n", - " [400 rows x 3 columns],\n", - " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_encoding = contrast('UIncome', None)\n", - "design.build_columns(Carseats, full_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "945ce7bc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.131739\n", - "UIncome[M] 0.115150\n", - "UIncome[H] 1.156118\n", - "UIncome[H]:ShelveLoc[Good] 3.738052\n", - "UIncome[H]:ShelveLoc[Medium] 1.514104\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "dtype: float64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "450b94dd", - "metadata": {}, - "source": [ - "## Special encodings\n", - "\n", - "For flexible models, we may want to consider transformations of features, i.e. polynomial\n", - "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n", - "we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms`\n", - "package includes a `Poly` transform" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "18d5c1c8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import poly\n", - "poly('Income', 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "46c7d911", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.440077\n", - "poly(Income, 3, )[0] 10.036373\n", - "poly(Income, 3, )[1] -2.799156\n", - "poly(Income, 3, )[2] 2.399601\n", - "ShelveLoc[Good] 4.808133\n", - "ShelveLoc[Medium] 1.889533\n", - "dtype: float64" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "99bf13a1", - "metadata": {}, - "source": [ - "Compare:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "7606facd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n", - " 5.440077 10.036373 -2.799156 2.399601 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.808133 1.889533 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "a4931031", - "metadata": {}, - "source": [ - "## Splines\n", - "\n", - "Support for natural and B-splines is also included" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "1c1bf5f3", - "metadata": 
{}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 4.240421\n", - "ns(Income, , df=5)[0] 1.468196\n", - "ns(Income, , df=5)[1] 1.499471\n", - "ns(Income, , df=5)[2] 1.152070\n", - "ns(Income, , df=5)[3] 2.418398\n", - "ns(Income, , df=5)[4] 1.804460\n", - "ShelveLoc[Good] 4.810449\n", - "ShelveLoc[Medium] 1.881095\n", - "dtype: float64" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import ns, bs, pca\n", - "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "8c24254b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n", - " 4.240421 1.468196 1.499471 1.152070 \n", - "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n", - " 2.418398 1.804460 4.810449 1.881095 \n" - ] - } - ], - "source": [ - "%%R\n", - "library(splines)\n", - "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "f9d6c4a7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 3.495085\n", - "bs(Income, , df=7, degree=2)[0] 1.813118\n", - "bs(Income, , df=7, degree=2)[1] 0.961852\n", - "bs(Income, , df=7, degree=2)[2] 2.471545\n", - "bs(Income, , df=7, degree=2)[3] 2.158891\n", - "bs(Income, , df=7, degree=2)[4] 2.091625\n", - "bs(Income, , df=7, degree=2)[5] 2.600669\n", - "bs(Income, , df=7, degree=2)[6] 2.843108\n", - "ShelveLoc[Good] 4.804919\n", - "ShelveLoc[Medium] 1.880337\n", - "dtype: float64" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n", - "X = 
design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "0bf1726a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) bs(Income, df = 7, degree = 2)1 \n", - " 3.4950851 1.8131176 \n", - "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n", - " 0.9618523 2.4715450 \n", - "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n", - " 2.1588908 2.0916252 \n", - "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n", - " 2.6006694 2.8431084 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.8049190 1.8803375 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "914df4cf", - "metadata": {}, - "source": [ - "## PCA" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "cc22e780", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.419405\n", - "pca(myvars, , n_components=2)[0] -0.001131\n", - "pca(myvars, , n_components=2)[1] -0.024217\n", - "ShelveLoc[Good] 4.816253\n", - "ShelveLoc[Medium] 1.924139\n", - "dtype: float64" - ] - }, - 
"execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars'), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "de571e61", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.419405 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n", - " 0.001131 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n", - " -0.024217 \n", - " ShelveLocGood \n", - " 4.816253 \n", - " ShelveLocMedium \n", - " 1.924139 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "0a103b5a", - "metadata": {}, - "source": [ - "It is of course common to scale before running PCA." 
- ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "95ca42f5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.352159\n", - "pca(myvars, , n_components=2)[0] 0.446383\n", - "pca(myvars, , n_components=2)[1] -1.219788\n", - "ShelveLoc[Good] 4.922780\n", - "ShelveLoc[Medium] 2.005617\n", - "dtype: float64" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars',\n", - " scale=True), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "0dc22e35", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.3522 \n", - "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n", - " 0.4469 \n", - 
"prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n", - " -1.2213 \n", - " ShelveLocGood \n", - " 4.9228 \n", - " ShelveLocMedium \n", - " 2.0056 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "70347ee9", - "metadata": {}, - "source": [ - "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n", - "of `np.std(ddof=1)`." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "aa0c2f2e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0.44694166, -1.22131519])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))" - ] - }, - { - "cell_type": "markdown", - "id": "ab05c497", - "metadata": {}, - "source": [ - "## Model selection\n", - "\n", - "Another task requiring different design matrices is model selection. Manipulating\n", - "the `terms` attribute of a `ModelSpec` (or more precisely its more uniform version `terms_`)\n", - "can clearly allow for both exhaustive and stepwise model selection." 
- ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "9505c178", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.strategy import (Stepwise, \n", - " min_max)\n", - "from ISLP.models.generic_selector import FeatureSelector" - ] - }, - { - "cell_type": "markdown", - "id": "020c2532", - "metadata": {}, - "source": [ - "### Best subsets" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "f9aba6db", - "metadata": {}, - "outputs": [], - "source": [ - "design = ModelSpec(['Price', \n", - " 'UIncome', \n", - " 'Advertising', \n", - " 'US', \n", - " 'Income',\n", - " 'ShelveLoc',\n", - " 'Education',\n", - " 'Urban']).fit(Carseats)\n", - "strategy = min_max(design,\n", - " min_terms=0,\n", - " max_terms=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "91144a3d", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error')" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "ae3cb2eb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.fit(Carseats, Y)" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "e63b2744", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Price', 'Advertising', 'ShelveLoc')" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "0a774b48", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('Price',), ('UIncome',), ('Advertising',), ('US',), ('Income',), ('ShelveLoc',), ('Education',), ('Urban',), ('Price', 'UIncome'), 
('Price', 'Advertising'), ('Price', 'US'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Price', 'Education'), ('Price', 'Urban'), ('UIncome', 'Advertising'), ('UIncome', 'US'), ('UIncome', 'Income'), ('UIncome', 'ShelveLoc'), ('UIncome', 'Education'), ('UIncome', 'Urban'), ('Advertising', 'US'), ('Advertising', 'Income'), ('Advertising', 'ShelveLoc'), ('Advertising', 'Education'), ('Advertising', 'Urban'), ('US', 'Income'), ('US', 'ShelveLoc'), ('US', 'Education'), ('US', 'Urban'), ('Income', 'ShelveLoc'), ('Income', 'Education'), ('Income', 'Urban'), ('ShelveLoc', 'Education'), ('ShelveLoc', 'Urban'), ('Education', 'Urban'), ('Price', 'UIncome', 'Advertising'), ('Price', 'UIncome', 'US'), ('Price', 'UIncome', 'Income'), ('Price', 'UIncome', 'ShelveLoc'), ('Price', 'UIncome', 'Education'), ('Price', 'UIncome', 'Urban'), ('Price', 'Advertising', 'US'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Advertising', 'Education'), ('Price', 'Advertising', 'Urban'), ('Price', 'US', 'Income'), ('Price', 'US', 'ShelveLoc'), ('Price', 'US', 'Education'), ('Price', 'US', 'Urban'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Income', 'Education'), ('Price', 'Income', 'Urban'), ('Price', 'ShelveLoc', 'Education'), ('Price', 'ShelveLoc', 'Urban'), ('Price', 'Education', 'Urban'), ('UIncome', 'Advertising', 'US'), ('UIncome', 'Advertising', 'Income'), ('UIncome', 'Advertising', 'ShelveLoc'), ('UIncome', 'Advertising', 'Education'), ('UIncome', 'Advertising', 'Urban'), ('UIncome', 'US', 'Income'), ('UIncome', 'US', 'ShelveLoc'), ('UIncome', 'US', 'Education'), ('UIncome', 'US', 'Urban'), ('UIncome', 'Income', 'ShelveLoc'), ('UIncome', 'Income', 'Education'), ('UIncome', 'Income', 'Urban'), ('UIncome', 'ShelveLoc', 'Education'), ('UIncome', 'ShelveLoc', 'Urban'), ('UIncome', 'Education', 'Urban'), ('Advertising', 'US', 'Income'), ('Advertising', 'US', 'ShelveLoc'), ('Advertising', 'US', 'Education'), ('Advertising', 'US', 'Urban'), 
('Advertising', 'Income', 'ShelveLoc'), ('Advertising', 'Income', 'Education'), ('Advertising', 'Income', 'Urban'), ('Advertising', 'ShelveLoc', 'Education'), ('Advertising', 'ShelveLoc', 'Urban'), ('Advertising', 'Education', 'Urban'), ('US', 'Income', 'ShelveLoc'), ('US', 'Income', 'Education'), ('US', 'Income', 'Urban'), ('US', 'ShelveLoc', 'Education'), ('US', 'ShelveLoc', 'Urban'), ('US', 'Education', 'Urban'), ('Income', 'ShelveLoc', 'Education'), ('Income', 'ShelveLoc', 'Urban'), ('Income', 'Education', 'Urban'), ('ShelveLoc', 'Education', 'Urban')])" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "0ca1f28c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Price', 'Advertising', 'Income')" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "strategy = min_max(design,\n", - " min_terms=0,\n", - " max_terms=3,\n", - " lower_terms=['Price'],\n", - " upper_terms=['Price', 'Income', 'Advertising'])\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error')\n", - "selector.fit(Carseats, Y)\n", - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "5c6732fa", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([('Price',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'Advertising', 'Income')])" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_.keys()" - ] - }, - { - "cell_type": "markdown", - "id": "7bb6fcc3", - "metadata": {}, - "source": [ - "### Stepwise selection" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "9985d0fc", - "metadata": {}, - "outputs": [ - { - 
"data": { - "text/plain": [ - "('Advertising', 'Income', 'Price', 'ShelveLoc')" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "strategy = Stepwise.first_peak(design,\n", - " min_terms=0,\n", - " max_terms=6,\n", - " lower_terms=['Price'],\n", - " upper_terms=['Price', 'Income', 'Advertising', 'ShelveLoc', 'UIncome', 'US'\n", - " 'Education', 'Urban'])\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error',\n", - " cv=3)\n", - "selector.fit(Carseats, Y)\n", - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "d3cf3e9b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('Price',), ('Price', 'UIncome'), ('Advertising', 'Price'), ('Income', 'Price'), ('Price', 'ShelveLoc'), ('Price', 'Urban'), ('Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Price', 'ShelveLoc'), ('Income', 'Price', 'ShelveLoc'), ('Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc'), ('Advertising', 'Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban')])" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "dd43ea7c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{(): -8.055847677297269,\n", - " ('Price',): -6.514630258019962,\n", - " ('Price', 'UIncome'): -6.621654905418576,\n", - " ('Advertising', 'Price'): -5.825225309857156,\n", - " ('Income', 'Price'): -6.455432795910743,\n", - " ('Price', 'ShelveLoc'): -3.780183168075897,\n", - " ('Price', 'Urban'): -6.5430157266926114,\n", - " ('Price', 'ShelveLoc', 'UIncome'): 
-3.6938729706475004,\n", - " ('Advertising', 'Price', 'ShelveLoc'): -3.2067316025050645,\n", - " ('Income', 'Price', 'ShelveLoc'): -3.634698914456587,\n", - " ('Price', 'ShelveLoc', 'Urban'): -3.776148947585277,\n", - " ('Advertising', 'Price', 'ShelveLoc', 'UIncome'): -3.1240961493998642,\n", - " ('Advertising', 'Income', 'Price', 'ShelveLoc'): -3.0801704971796244,\n", - " ('Advertising', 'Price', 'ShelveLoc', 'Urban'): -3.207569489139369,\n", - " ('Advertising',\n", - " 'Income',\n", - " 'Price',\n", - " 'ShelveLoc',\n", - " 'UIncome'): -3.1048826894036115,\n", - " ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban'): -3.0867130108677423}" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "7c026f0a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Advertising', 'Income', 'Price', 'ShelveLoc')" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.selected_state_" - ] - }, - { - "cell_type": "markdown", - "id": "b4b89d04", - "metadata": {}, - "source": [ - "### Enforcing constraints\n", - "\n", - "In models with interactions, we may often want to impose constraints on interactions and main effects.\n", - "This can be achieved here by use of a `validator` that checks whether a given model is valid.\n", - "\n", - "Suppose we want to have the following constraint: `ShelveLoc` may not be in the model unless\n", - "`Price` is in the following model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "1c1e31d0", - "metadata": {}, - "outputs": [], - "source": [ - "design = ModelSpec(['Price', \n", - " 'Advertising', \n", - " 'Income',\n", - " 'ShelveLoc']).fit(Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "be929807", - "metadata": {}, - "source": [ - "The constraints are described with a boolean matrix with `(i,j)` as `j` is a child of `i`: so `j` should not\n", - "be in the model when `i` is not and enforced with a callable `validator` that evaluates each candidate state.\n", - "\n", - "Both `min_max_strategy` and `step_strategy` accept a `validator` argument." - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "c075b1b7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('Price',), ('Advertising',), ('Income',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Advertising', 'Income'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Advertising', 'Income', 'ShelveLoc')])" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.strategy import validator_from_constraints\n", - "constraints = np.zeros((4, 4))\n", - "constraints[0,3] = 1\n", - "strategy = min_max(design,\n", - " min_terms=0,\n", - " max_terms=4,\n", - " validator=validator_from_constraints(design,\n", - " constraints))\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error',\n", - " cv=3)\n", - "selector.fit(Carseats, Y)\n", - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "3472d47c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Price', 'Advertising', 'Income', 'ShelveLoc')" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "5d2c82b9", - "metadata": {}, - "outputs": [], - "source": [ - "Hitters=load_data('Hitters')" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "id": "4b2ac2c2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n", - " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n", - " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n", - " dtype='object')" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Hitters.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "bd2ad0dd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('AtBat',), ('Hits',), ('HmRun',), ('Runs',), ('RBI',), ('Walks',), ('Years',), ('CAtBat',), ('CHits',), ('CHmRun',), ('CRuns',), ('CRBI',), ('CWalks',), ('League',), ('Division',), ('PutOuts',), ('Assists',), ('Errors',), ('NewLeague',), ('AtBat', 'CRBI'), ('CRBI', 'Hits'), ('CRBI', 'HmRun'), ('CRBI', 'Runs'), ('CRBI', 'RBI'), ('CRBI', 'Walks'), ('CRBI', 'Years'), ('CAtBat', 'CRBI'), ('CHits', 'CRBI'), ('CHmRun', 'CRBI'), ('CRBI', 'CRuns'), ('CRBI', 'CWalks'), ('CRBI', 'League'), ('CRBI', 'Division'), ('CRBI', 'PutOuts'), ('Assists', 'CRBI'), ('CRBI', 'Errors'), ('CRBI', 'NewLeague'), ('AtBat', 'CRBI', 'Hits'), ('CRBI', 'Hits', 'HmRun'), ('CRBI', 'Hits', 'Runs'), ('CRBI', 'Hits', 'RBI'), ('CRBI', 'Hits', 'Walks'), ('CRBI', 'Hits', 'Years'), ('CAtBat', 'CRBI', 'Hits'), ('CHits', 'CRBI', 'Hits'), ('CHmRun', 'CRBI', 'Hits'), ('CRBI', 'CRuns', 'Hits'), ('CRBI', 'CWalks', 'Hits'), ('CRBI', 'Hits', 'League'), ('CRBI', 'Division', 'Hits'), ('CRBI', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits'), ('CRBI', 'Errors', 'Hits'), ('CRBI', 'Hits', 'NewLeague'), ('AtBat', 
'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Hits', 'PutOuts', 'Years'), ('CAtBat', 'CRBI', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'League', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), ('CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'League', 'PutOuts'), ('Assists', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 
'League', 'PutOuts'), ('Assists', 'AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 
'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 
'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 
'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 
'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years')])" - ] - 
}, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Hitters = Hitters.dropna()\n", - "Y=Hitters['Salary']\n", - "X=Hitters.drop('Salary', axis=1)\n", - "design = ModelSpec(X.columns).fit(X)\n", - "strategy = Stepwise.first_peak(design,\n", - " direction='forward',\n", - " min_terms=0,\n", - " max_terms=19)\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error', cv=None)\n", - "selector.fit(X, Y)\n", - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "31788748", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "19" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(selector.selected_state_)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "id": "e97d80c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "19" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(X.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a71f0332", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Start: AIC=3215.77\n", - "Salary ~ 1\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ CRBI 1 17139434 36179679 3115.8\n", - "+ CRuns 1 16881162 36437951 3117.6\n", - "+ CHits 1 16065140 37253973 3123.5\n", - "+ CAtBat 1 14759710 38559403 3132.5\n", - "+ CHmRun 1 14692193 38626920 3133.0\n", - "+ CWalks 1 12792622 40526491 3145.6\n", - "+ RBI 1 10771083 42548030 3158.4\n", - "+ Walks 1 10504833 42814280 3160.1\n", - "+ Hits 1 10260491 43058621 3161.6\n", - "+ Runs 1 9399158 43919955 3166.8\n", - "+ Years 1 8559105 44760007 3171.7\n", - "+ AtBat 1 8309469 45009644 3173.2\n", - "+ HmRun 1 6273967 47045145 3184.8\n", - "+ PutOuts 1 4814100 48505013 3192.9\n", - "+ 
Division 1 1976102 51343011 3207.8\n", - " 53319113 3215.8\n", - "+ Assists 1 34497 53284615 3217.6\n", - "+ League 1 10876 53308237 3217.7\n", - "+ Errors 1 1555 53317558 3217.8\n", - "+ NewLeague 1 428 53318684 3217.8\n", - "\n", - "Step: AIC=3115.78\n", - "Salary ~ CRBI\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ Hits 1 5533119 30646560 3074.1\n", - "+ Runs 1 5176532 31003147 3077.2\n", - "+ Walks 1 4199733 31979946 3085.3\n", - "+ AtBat 1 4064585 32115095 3086.4\n", - "+ RBI 1 3308272 32871407 3092.6\n", - "+ PutOuts 1 3267035 32912644 3092.9\n", - "+ Division 1 1733887 34445793 3104.9\n", - "+ Years 1 1667339 34512340 3105.4\n", - "+ HmRun 1 1271587 34908092 3108.4\n", - "+ CRuns 1 354561 35825119 3115.2\n", - "+ Assists 1 346020 35833659 3115.2\n", - " 36179679 3115.8\n", - "+ Errors 1 194403 35985276 3116.4\n", - "+ CAtBat 1 92261 36087418 3117.1\n", - "+ CHits 1 75469 36104210 3117.2\n", - "+ CWalks 1 51974 36127705 3117.4\n", - "+ NewLeague 1 17778 36161901 3117.7\n", - "+ League 1 11825 36167855 3117.7\n", - "+ CHmRun 1 515 36179165 3117.8\n", - "\n", - "Step: AIC=3074.13\n", - "Salary ~ CRBI + Hits\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ PutOuts 1 1397263 29249297 3063.8\n", - "+ Division 1 1279275 29367285 3064.9\n", - "+ AtBat 1 821767 29824793 3069.0\n", - "+ Walks 1 781767 29864793 3069.3\n", - "+ Years 1 254910 30391650 3073.9\n", - " 30646560 3074.1\n", - "+ League 1 208880 30437680 3074.3\n", - "+ CRuns 1 132614 30513946 3075.0\n", - "+ NewLeague 1 118474 30528086 3075.1\n", - "+ Runs 1 114198 30532362 3075.1\n", - "+ Errors 1 99776 30546784 3075.3\n", - "+ CAtBat 1 83517 30563043 3075.4\n", - "+ Assists 1 44781 30601779 3075.7\n", - "+ CWalks 1 23668 30622892 3075.9\n", - "+ CHmRun 1 4790 30641769 3076.1\n", - "+ CHits 1 4358 30642202 3076.1\n", - "+ HmRun 1 2173 30644387 3076.1\n", - "+ RBI 1 1137 30645423 3076.1\n", - "\n", - "Step: AIC=3063.85\n", - "Salary ~ CRBI + Hits + PutOuts\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ Division 
1 1278445 27970852 3054.1\n", - "+ AtBat 1 1009933 28239364 3056.6\n", - "+ Walks 1 539490 28709807 3061.0\n", - "+ CRuns 1 273649 28975648 3063.4\n", - " 29249297 3063.8\n", - "+ Years 1 136906 29112391 3064.6\n", - "+ League 1 122841 29126456 3064.8\n", - "+ Runs 1 117930 29131367 3064.8\n", - "+ Errors 1 97244 29152053 3065.0\n", - "+ NewLeague 1 57839 29191458 3065.3\n", - "+ CHits 1 35096 29214201 3065.5\n", - "+ RBI 1 33965 29215331 3065.6\n", - "+ HmRun 1 31227 29218070 3065.6\n", - "+ CWalks 1 28572 29220725 3065.6\n", - "+ CAtBat 1 20518 29228779 3065.7\n", - "+ Assists 1 1681 29247616 3065.8\n", - "+ CHmRun 1 1419 29247878 3065.8\n", - "\n", - "Step: AIC=3054.1\n", - "Salary ~ CRBI + Hits + PutOuts + Division\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ AtBat 1 820952 27149899 3048.3\n", - "+ Walks 1 491584 27479268 3051.4\n", - " 27970852 3054.1\n", - "+ CRuns 1 193604 27777248 3054.3\n", - "+ Years 1 166845 27804007 3054.5\n", - "+ League 1 110628 27860224 3055.1\n", - "+ Errors 1 81385 27889467 3055.3\n", - "+ Runs 1 65921 27904931 3055.5\n", - "+ RBI 1 53719 27917133 3055.6\n", - "+ NewLeague 1 52275 27918577 3055.6\n", - "+ CHits 1 33863 27936989 3055.8\n", - "+ HmRun 1 26390 27944462 3055.8\n", - "+ CAtBat 1 18751 27952101 3055.9\n", - "+ CWalks 1 5723 27965129 3056.0\n", - "+ Assists 1 1036 27969816 3056.1\n", - "+ CHmRun 1 165 27970687 3056.1\n", - "\n", - "Step: AIC=3048.26\n", - "Salary ~ CRBI + Hits + PutOuts + Division + AtBat\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ Walks 1 954996 26194904 3040.8\n", - "+ Years 1 253362 26896537 3047.8\n", - "+ Runs 1 208743 26941157 3048.2\n", - " 27149899 3048.3\n", - "+ CRuns 1 185825 26964075 3048.5\n", - "+ League 1 95986 27053913 3049.3\n", - "+ NewLeague 1 52693 27097206 3049.8\n", - "+ CHmRun 1 43173 27106726 3049.8\n", - "+ Assists 1 28898 27121001 3050.0\n", - "+ CAtBat 1 20989 27128910 3050.1\n", - "+ CWalks 1 15599 27134301 3050.1\n", - "+ Errors 1 6265 27143634 3050.2\n", - "+ CHits 1 5305 
27144594 3050.2\n", - "+ RBI 1 1236 27148663 3050.2\n", - "+ HmRun 1 11 27149888 3050.3\n", - "\n", - "Step: AIC=3040.85\n", - "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ CWalks 1 240687 25954217 3040.4\n", - " 26194904 3040.8\n", - "+ Years 1 184508 26010396 3041.0\n", - "+ CRuns 1 110695 26084209 3041.7\n", - "+ League 1 77974 26116930 3042.1\n", - "+ Assists 1 75782 26119122 3042.1\n", - "+ NewLeague 1 40909 26153995 3042.4\n", - "+ CHits 1 37304 26157599 3042.5\n", - "+ RBI 1 11728 26183176 3042.7\n", - "+ HmRun 1 4747 26190157 3042.8\n", - "+ Errors 1 2727 26192177 3042.8\n", - "+ CAtBat 1 2630 26192274 3042.8\n", - "+ CHmRun 1 943 26193961 3042.8\n", - "+ Runs 1 37 26194867 3042.8\n", - "\n", - "Step: AIC=3040.42\n", - "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks + CWalks\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ CRuns 1 794983 25159234 3034.2\n", - "+ CHits 1 273728 25680489 3039.6\n", - " 25954217 3040.4\n", - "+ Assists 1 138506 25815711 3041.0\n", - "+ CAtBat 1 89289 25864929 3041.5\n", - "+ RBI 1 86941 25867276 3041.5\n", - "+ League 1 77159 25877058 3041.6\n", - "+ Years 1 70126 25884091 3041.7\n", - "+ NewLeague 1 37807 25916410 3042.0\n", - "+ HmRun 1 33601 25920616 3042.1\n", - "+ CHmRun 1 9034 25945183 3042.3\n", - "+ Errors 1 6928" - ] - } - ], - "source": [ - "%%R -i Hitters\n", - "step(lm(Salary ~ 1, data=Hitters), scope=list(upper=lm(Salary ~ ., data=Hitters)), direction='forward', trace=TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6117f650", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "536a8bc3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bddc13c5", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -2726,9 +262,9 @@ "formats": 
"source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/docs/jupyterbook/models/selection.md b/docs/jupyterbook/models/selection.md index c868c75..949ccc1 100644 --- a/docs/jupyterbook/models/selection.md +++ b/docs/jupyterbook/models/selection.md @@ -5,670 +5,107 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Model selection using `ModelSpec` -```{code-cell} ipython3 -import numpy as np, pandas as pd -%load_ext rpy2.ipython - -from ISLP import load_data -from ISLP.models import ModelSpec - -import statsmodels.api as sm -``` - -```{code-cell} ipython3 -Carseats = load_data('Carseats') -%R -i Carseats -Carseats.columns -``` - -## Let's break up income into groups - -```{code-cell} ipython3 -Carseats['OIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H']) -Carseats['OIncome'] -``` - -Let's also create an unordered version - -```{code-cell} ipython3 -Carseats['UIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H'], - ordered=False) -Carseats['UIncome'] -``` - -## A simple model - -```{code-cell} ipython3 -design = ModelSpec(['Price', 'Income']) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -Y = Carseats['Sales'] -M = sm.OLS(Y, X).fit() -M.params -``` - -## Basic procedure - -The design matrix is built by cobbling together a set of columns and possibly transforming them. -A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit` -is to inspect a dataframe for column info. 
The column `ShelveLoc` is categorical: - -```{code-cell} ipython3 -Carseats['ShelveLoc'] -``` - -This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods -`get_columns` and `fit_encoder`. - -```{code-cell} ipython3 -design.column_info_['ShelveLoc'] -``` - -It recognized ordinal columns as well. - -```{code-cell} ipython3 -design.column_info_['OIncome'] -``` - -```{code-cell} ipython3 -income = design.column_info_['Income'] -cols, names = income.get_columns(Carseats) -(cols[:4], names) -``` - -## Encoding a column - -In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical -variables usually are encoded by several columns, typically one less than the number of categories. -This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform -model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits -its encoder the first time data is passed to it. - -```{code-cell} ipython3 -shelve = design.column_info_['ShelveLoc'] -cols, names = shelve.get_columns(Carseats) -(cols[:4], names) -``` - -```{code-cell} ipython3 -oincome = design.column_info_['OIncome'] -oincome.get_columns(Carseats)[0][:4] -``` - -## The terms - -The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through -the `terms` argument which should be a sequence. The elements of `terms` are often -going to be strings (or tuples of strings for interactions, see below) but are converted to a -`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple. - -```{code-cell} ipython3 -design.terms -``` - -```{code-cell} ipython3 -design.terms_ -``` - -While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. 
A -`Variable` can also create columns through the `build_columns` method of `ModelSpec` - -```{code-cell} ipython3 -price = design.terms_[0] -design.build_columns(Carseats, price) -``` - -Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The -tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then -is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during -the call to `ModelSpec.fit`. - -```{code-cell} ipython3 -from ISLP.models.model_spec import Variable - -new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None) -design.build_columns(Carseats, new_var) -``` - -Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the -arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`. - -```{code-cell} ipython3 -from sklearn.decomposition import PCA -pca = PCA(n_components=2) -pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit` -pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca) -design.build_columns(Carseats, pca_var) -``` - -The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`) -or `Variable` instances (`pca_var`). - -```{code-cell} ipython3 -fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None) -design.build_columns(Carseats, fancy_var) -``` - -We can of course run PCA again on these features (if we wanted). 
- -```{code-cell} ipython3 -pca2 = PCA(n_components=2) -pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit` -pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2) -design.build_columns(Carseats, pca2_var) -``` - -## Building the design matrix - -With these notions in mind, the final design is essentially then - -```{code-cell} ipython3 -X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4] -``` - -An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is -a dataframe the index is adjusted accordingly. - -```{code-cell} ipython3 -design.intercept -``` - -```{code-cell} ipython3 -design.transform(Carseats)[:4] -``` - -## Predicting - -Constructing the design matrix at any values is carried out by the `transform` method. - -```{code-cell} ipython3 -new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]}) -new_X = design.transform(new_data) -M.get_prediction(new_X).predicted_mean -``` - -```{code-cell} ipython3 -%%R -i new_data,Carseats -predict(lm(Sales ~ Price + Income, data=Carseats), new_data) -``` - -### Difference between using `pd.DataFrame` and `np.ndarray` - -If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns. - -If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so, -in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. 
- -```{code-cell} ipython3 -Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']]) -design_np = ModelSpec([0,3]).fit(Carseats_np) -design_np.transform(Carseats_np)[:4] -``` - -The following will fail for hopefully obvious reasons - -```{code-cell} ipython3 -try: - new_D = np.zeros((2,2)) - new_D[:,0] = [10,20] - new_D[:,1] = [40,50] - M.get_prediction(new_D).predicted_mean -except ValueError as e: - print(e) -``` - -Ultimately, `M` expects 3 columns for new predictions because it was fit -with a matrix having 3 columns (the first representing an intercept). - -We might be tempted to try as with the `pd.DataFrame` and produce -an `np.ndarray` with only the necessary variables. - -```{code-cell} ipython3 -try: - new_X = np.zeros((2,2)) - new_X[:,0] = [10,20] - new_X[:,1] = [40,50] - new_D = design_np.transform(new_X) - M.get_prediction(new_D).predicted_mean -except IndexError as e: - print(e) -``` - -This fails because `design_np` is looking for column `3` from its `terms`: - -```{code-cell} ipython3 -design_np.terms_ -``` - -However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed -represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`: - -```{code-cell} ipython3 -new_X = np.zeros((2,4)) -new_X[:,0] = [10,20] -new_X[:,3] = [40,50] -new_D = design_np.transform(new_X) -M.get_prediction(new_D).predicted_mean -``` - -Given this subtlety about needing to supply arrays with identical column structure to `transform` when -using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case. - -+++ - -## A model with some categorical variables - -Categorical variables become `Column` instances with encoders. 
- -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats) -design.column_info_['UIncome'] -``` - -```{code-cell} ipython3 -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef -``` - -## Getting the encoding you want - -By default the level dropped by `ModelSpec` will be the first of the `categories_` values from -`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems -as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)` -where `new_encoder` would somehow drop the column we want dropped. - -However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`: - -```{code-cell} ipython3 -design.column_info_['UIncome'] -``` - -This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when -we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest -and we need a way to allow different encodings of the same column of `Carseats` - -```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) -``` - - We can create a new -`Column` with the encoder we want. For categorical variables, there is a convenience function to do so. 
- -```{code-cell} ipython3 -from ISLP.models.model_spec import contrast -pref_encoding = contrast('UIncome', 'drop', 'L') -``` - -```{code-cell} ipython3 -design.build_columns(Carseats, pref_encoding) -``` - -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef -``` - -## Interactions - -We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument -to `ModelSpec`. - -```{code-cell} ipython3 -design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with -`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`. - -```{code-cell} ipython3 -design.terms_[0] -``` - -Comparing this to the previous `R` model. - -```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) -``` - -We note a few important things: - -1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these -columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!** - -2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. 
**`ModelSpec` does no inspection of terms to decide how to encode categorical variables.** - -A few notes: - -- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.** -- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily. - -```{code-cell} ipython3 -full_encoding = contrast('UIncome', None) -design.build_columns(Carseats, full_encoding) -``` - -```{code-cell} ipython3 -design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -## Special encodings - -For flexible models, we may want to consider transformations of features, i.e. polynomial -or spline transformations. Given transforms that follow the `fit/transform` paradigm -we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms` -package includes a `Poly` transform - -```{code-cell} ipython3 -from ISLP.models.model_spec import poly -poly('Income', 3) -``` - -```{code-cell} ipython3 -design = ModelSpec([poly('Income', 3), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -Compare: - -```{code-cell} ipython3 -%%R -lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef -``` - -## Splines - -Support for natural and B-splines is also included - -```{code-cell} ipython3 -from ISLP.models.model_spec import ns, bs, pca -design = ModelSpec([ns('Income', df=5), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -library(splines) -lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef -``` - -```{code-cell} ipython3 -design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef -``` - -## PCA - -```{code-cell} ipython3 -design = ModelSpec([pca(['Income', - 'Price', - 'Advertising', - 'Population'], - n_components=2, - name='myvars'), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` +In this lab we illustrate how to run forward stepwise model selection +using the model specification capability of `ModelSpec`. ```{code-cell} ipython3 -%%R -lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats) -``` - -It is of course common to scale before running PCA. 
- -```{code-cell} ipython3 -design = ModelSpec([pca(['Income', - 'Price', - 'Advertising', - 'Population'], - n_components=2, - name='myvars', - scale=True), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats) -``` - -There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead -of `np.std(ddof=1)`. - -```{code-cell} ipython3 -np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1)) -``` - -## Model selection - -Another task requiring different design matrices is model selection. Manipulating -the `terms` attribute of a `ModelSpec` (or more precisely its more uniform version `terms_`) -can clearly allow for both exhaustive and stepwise model selection. - -```{code-cell} ipython3 -from ISLP.models.strategy import (Stepwise, - min_max) -from ISLP.models.generic_selector import FeatureSelector -``` - -### Best subsets - -```{code-cell} ipython3 -design = ModelSpec(['Price', - 'UIncome', - 'Advertising', - 'US', - 'Income', - 'ShelveLoc', - 'Education', - 'Urban']).fit(Carseats) -strategy = min_max(design, - min_terms=0, - max_terms=3) -``` - -```{code-cell} ipython3 -from sklearn.linear_model import LinearRegression -selector = FeatureSelector(LinearRegression(fit_intercept=False), - strategy, - scoring='neg_mean_squared_error') -``` - -```{code-cell} ipython3 -selector.fit(Carseats, Y) -``` - -```{code-cell} ipython3 -selector.selected_state_ -``` - -```{code-cell} ipython3 -selector.results_.keys() -``` - -```{code-cell} ipython3 -strategy = min_max(design, - min_terms=0, - max_terms=3, - lower_terms=['Price'], - upper_terms=['Price', 'Income', 'Advertising']) -selector = FeatureSelector(LinearRegression(fit_intercept=False), - strategy, - scoring='neg_mean_squared_error') -selector.fit(Carseats, Y) -selector.selected_state_ 
-``` - -```{code-cell} ipython3 -selector.results_.keys() +import numpy as np +import pandas as pd +from statsmodels.api import OLS +from ISLP import load_data +from ISLP.models import (ModelSpec, + Stepwise, + sklearn_selected) ``` -### Stepwise selection +### Forward Selection + +We will apply the forward-selection approach to the `Hitters` +data. We wish to predict a baseball player’s `Salary` on the +basis of various statistics associated with performance in the +previous year. ```{code-cell} ipython3 -strategy = Stepwise.first_peak(design, - min_terms=0, - max_terms=6, - lower_terms=['Price'], - upper_terms=['Price', 'Income', 'Advertising', 'ShelveLoc', 'UIncome', 'US' - 'Education', 'Urban']) -selector = FeatureSelector(LinearRegression(fit_intercept=False), - strategy, - scoring='neg_mean_squared_error', - cv=3) -selector.fit(Carseats, Y) -selector.selected_state_ +Hitters = load_data('Hitters') +np.isnan(Hitters['Salary']).sum() ``` -```{code-cell} ipython3 -selector.results_.keys() -``` - -```{code-cell} ipython3 -selector.results_ -``` + + We see that `Salary` is missing for 59 players. The +`dropna()` method of data frames removes all of the rows that have missing +values in any variable (by default --- see `Hitters.dropna?`). ```{code-cell} ipython3 -selector.selected_state_ +Hitters = Hitters.dropna() +Hitters.shape ``` -### Enforcing constraints - -In models with interactions, we may often want to impose constraints on interactions and main effects. -This can be achieved here by use of a `validator` that checks whether a given model is valid. - -Suppose we want to have the following constraint: `ShelveLoc` may not be in the model unless -`Price` is in the following model. +We first choose the best model using forward selection based on AIC. This score +is not built in as a metric to `sklearn`. We therefore define a function to compute it ourselves, and use +it as a scorer. 
By default, `sklearn` tries to maximize a score, hence + our scoring function computes the negative AIC statistic. ```{code-cell} ipython3 +def negAIC(estimator, X, Y): + "Negative AIC" + n, p = X.shape + Yhat = estimator.predict(X) + MSE = np.mean((Y - Yhat)**2) + return n + n * np.log(MSE) + 2 * (p + 1) + ``` We need to estimate the residual variance $\sigma^2$, which enters the AIC criterion through the MSE computed by our scoring function above. +We will fit the biggest model, using all the variables, and estimate $\sigma^2$ based on its MSE. ```{code-cell} ipython3 +design = ModelSpec(Hitters.columns.drop('Salary')).fit(Hitters) +Y = np.array(Hitters['Salary']) +X = design.transform(Hitters) ``` Along with a score we need to specify the search strategy. This is done through the object +`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()` +runs forward stepwise until any further additions to the model do not result +in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()` +runs a fixed number of steps of stepwise search. 
```{code-cell} ipython3 -strategy = Stepwise.first_peak(design, direction='forward', - min_terms=0, - max_terms=19) -selector = FeatureSelector(LinearRegression(fit_intercept=False), - strategy, - scoring='neg_mean_squared_error', cv=None) -selector.fit(X, Y) -selector.results_.keys() + max_terms=len(design.terms)) ``` -```{code-cell} ipython3 -len(selector.selected_state_) -``` + +We now fit a linear regression model with `Salary` as outcome using forward +selection. To do so, we use the function `sklearn_selected()` from the `ISLP.models` package. This takes +a model from `statsmodels` along with a search strategy and selects a model with its +`fit` method. Without specifying a `scoring` argument, the score defaults to MSE, and so all 19 variables will be +selected. ```{code-cell} ipython3 -len(X.columns) +hitters_MSE = sklearn_selected(OLS, + strategy) +hitters_MSE.fit(Hitters, Y) +hitters_MSE.selected_state_ ``` -```{code-cell} ipython3 -%%R -i Hitters -step(lm(Salary ~ 1, data=Hitters), scope=list(upper=lm(Salary ~ ., data=Hitters)), direction='forward', trace=TRUE) -``` + Using `negAIC` results in a smaller model, as expected, with just 4 variables selected. ```{code-cell} ipython3 - -``` - -```{code-cell} ipython3 - -``` - -```{code-cell} ipython3 - +hitters_Cp = sklearn_selected(OLS, + strategy, + scoring=negAIC) +hitters_Cp.fit(Hitters, Y) +hitters_Cp.selected_state_ ``` diff --git a/docs/jupyterbook/models/spec.ipynb b/docs/jupyterbook/models/spec.ipynb index b60e402..fce6b32 100644 --- a/docs/jupyterbook/models/spec.ipynb +++ b/docs/jupyterbook/models/spec.ipynb @@ -7,7 +7,14 @@ "source": [ "# Building design matrices with `ModelSpec`\n", "\n", - "Force rebuild" + "The `ISLP` package provides a facility to build design\n", + "matrices for regression and classification tasks. 
It provides similar functionality to the formula\n", + "notation of `R` though uses python objects rather than specification through the special formula syntax.\n", + "\n", + "Related tools include `patsy` and `ColumnTransformer` from `sklearn.compose`. \n", + "\n", + "Perhaps the most common use is to extract some columns from a `pd.DataFrame` and \n", + "produce a design matrix, optionally with an intercept." ] }, { @@ -17,12 +24,15 @@ "metadata": {}, "outputs": [], "source": [ - "x=4\n", - "import numpy as np, pandas as pd\n", - "%load_ext rpy2.ipython\n", + "import pandas as pd\n", + "import numpy as np\n", "\n", "from ISLP import load_data\n", - "from ISLP.models import ModelSpec\n", + "from ISLP.models import (ModelSpec,\n", + " summarize,\n", + " Column,\n", + " Feature,\n", + " build_columns)\n", "\n", "import statsmodels.api as sm" ] @@ -48,40 +58,42 @@ ], "source": [ "Carseats = load_data('Carseats')\n", - "%R -i Carseats\n", "Carseats.columns" ] }, { "cell_type": "markdown", - "id": "excellent-hamilton", + "id": "b7a2e6ab-491d-4a57-8184-a9fcccb2047b", "metadata": {}, "source": [ - "## Let's break up income into groups" + "We'll first build a design matrix that we can use to model `Sales`\n", + "in terms of the categorical variable `ShelveLoc` and `Price`.\n", + "\n", + "We see first that `ShelveLoc` is a categorical variable:" ] }, { "cell_type": "code", "execution_count": 3, - "id": "going-administrator", + "id": "7d3642a6-90c6-48ad-8d35-88231b4991f8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: OIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L' < 'M' < 'H']" + "0 Bad\n", + "1 Good\n", + "2 Medium\n", + "3 Medium\n", + "4 Bad\n", + " ... 
\n", + "395 Good\n", + "396 Medium\n", + "397 Medium\n", + "398 Bad\n", + "399 Good\n", + "Name: ShelveLoc, Length: 400, dtype: category\n", + "Categories (3, object): ['Bad', 'Good', 'Medium']" ] }, "execution_count": 3, @@ -90,42 +102,142 @@ } ], "source": [ - "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'])\n", - "Carseats['OIncome']" + "Carseats['ShelveLoc']" ] }, { "cell_type": "markdown", - "id": "warming-mobile", + "id": "4afa201d-4b19-4d85-9e1b-1392a54d027b", "metadata": {}, "source": [ - "Let's also create an unordered version" + "This is recognized by `ModelSpec` and only 2 columns are added for the three levels. The\n", + "default behavior is to drop the first level of the categories. Later, \n", + "we will show other contrasts of the 3 columns can be produced. \n", + "\n", + "This simple example below illustrates how the first argument (its `terms`) is\n", + "used to construct a design matrix." ] }, { "cell_type": "code", "execution_count": 4, - "id": "varying-fourth", + "id": "fd5528fe-11da-4e10-8996-06085896c1a0", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptShelveLoc[Good]ShelveLoc[Medium]Price
01.00.00.0120
11.01.00.083
21.00.01.080
31.00.01.097
41.00.00.0128
51.00.00.072
61.00.01.0108
71.01.00.0120
81.00.01.0124
91.00.01.0124
\n", + "
" + ], "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: UIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L', 'M', 'H']" + " intercept ShelveLoc[Good] ShelveLoc[Medium] Price\n", + "0 1.0 0.0 0.0 120\n", + "1 1.0 1.0 0.0 83\n", + "2 1.0 0.0 1.0 80\n", + "3 1.0 0.0 1.0 97\n", + "4 1.0 0.0 0.0 128\n", + "5 1.0 0.0 0.0 72\n", + "6 1.0 0.0 1.0 108\n", + "7 1.0 1.0 0.0 120\n", + "8 1.0 0.0 1.0 124\n", + "9 1.0 0.0 1.0 124" ] }, "execution_count": 4, @@ -134,31 +246,129 @@ } ], "source": [ - "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'],\n", - " ordered=False)\n", - "Carseats['UIncome']" + "MS = ModelSpec(['ShelveLoc', 'Price'])\n", + "X = MS.fit_transform(Carseats)\n", + "X.iloc[:10]" ] }, { "cell_type": "markdown", - "id": "utility-viking", + "id": "6948e1ef-3685-4840-a4f2-ef15a1bcfb69", "metadata": {}, "source": [ - "## A simple model" + "We note that a column has been added for the intercept by default. This can be changed using the\n", + "`intercept` argument." ] }, { "cell_type": "code", "execution_count": 5, - "id": "unlikely-begin", + "id": "682d4c81-eba9-467d-a176-911a0269a21d", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ShelveLoc[Good]ShelveLoc[Medium]Price
00.00.0120
11.00.083
20.01.080
30.01.097
40.00.0128
50.00.072
60.01.0108
71.00.0120
80.01.0124
90.01.0124
\n", + "
" + ], "text/plain": [ - "Index(['intercept', 'Price', 'Income'], dtype='object')" + " ShelveLoc[Good] ShelveLoc[Medium] Price\n", + "0 0.0 0.0 120\n", + "1 1.0 0.0 83\n", + "2 0.0 1.0 80\n", + "3 0.0 1.0 97\n", + "4 0.0 0.0 128\n", + "5 0.0 0.0 72\n", + "6 0.0 1.0 108\n", + "7 1.0 0.0 120\n", + "8 0.0 1.0 124\n", + "9 0.0 1.0 124" ] }, "execution_count": 5, @@ -167,24 +377,143 @@ } ], "source": [ - "design = ModelSpec(['Price', 'Income'])\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" + "MS_no1 = ModelSpec(['ShelveLoc', 'Price'], intercept=False)\n", + "MS_no1.fit_transform(Carseats)[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "54d8fd20-d8f5-44d6-9965-83e745680798", + "metadata": {}, + "source": [ + "We see that `ShelveLoc` still only contributes\n", + "two columns to the design. The `ModelSpec` object does no introspection of its arguments to effectively include an intercept term\n", + "in the column space of the design matrix.\n", + "\n", + "To include this intercept via `ShelveLoc` we can use 3 columns to encode this categorical variable. Following the nomenclature of\n", + "`R`, we call this a `Contrast` of the categorical variable." ] }, { "cell_type": "code", "execution_count": 6, - "id": "driven-employee", + "id": "555734bb-2682-4721-a1cd-6fb207394b0e", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ShelveLoc[Bad]ShelveLoc[Good]ShelveLoc[Medium]Price
01.00.00.0120
10.01.00.083
20.00.01.080
30.00.01.097
41.00.00.0128
51.00.00.072
60.00.01.0108
70.01.00.0120
80.00.01.0124
90.00.01.0124
\n", + "
" + ], "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" + " ShelveLoc[Bad] ShelveLoc[Good] ShelveLoc[Medium] Price\n", + "0 1.0 0.0 0.0 120\n", + "1 0.0 1.0 0.0 83\n", + "2 0.0 0.0 1.0 80\n", + "3 0.0 0.0 1.0 97\n", + "4 1.0 0.0 0.0 128\n", + "5 1.0 0.0 0.0 72\n", + "6 0.0 0.0 1.0 108\n", + "7 0.0 1.0 0.0 120\n", + "8 0.0 0.0 1.0 124\n", + "9 0.0 0.0 1.0 124" ] }, "execution_count": 6, @@ -193,45 +522,32 @@ } ], "source": [ - "Y = Carseats['Sales']\n", - "M = sm.OLS(Y, X).fit()\n", - "M.params" + "from ISLP.models import contrast\n", + "shelve = contrast('ShelveLoc', None)\n", + "MS_contr = ModelSpec([shelve, 'Price'], intercept=False)\n", + "MS_contr.fit_transform(Carseats)[:10]" ] }, { "cell_type": "markdown", - "id": "secondary-winner", + "id": "66db03cf-489c-40b6-8fac-762d66cf9932", "metadata": {}, "source": [ - "## Basic procedure\n", - "\n", - "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n", - "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n", - "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:" + "This example above illustrates that columns need not be identified by name in `terms`. The basic\n", + "role of an item in the `terms` sequence is a description of how to extract a column\n", + "from a columnar data object, usually a `pd.DataFrame`." ] }, { "cell_type": "code", "execution_count": 7, - "id": "bored-making", + "id": "852ee40e-05d2-4785-ab7d-968fb087f3c0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 Bad\n", - "1 Good\n", - "2 Medium\n", - "3 Medium\n", - "4 Bad\n", - " ... 
\n", - "395 Good\n", - "396 Medium\n", - "397 Medium\n", - "398 Bad\n", - "399 Good\n", - "Name: ShelveLoc, Length: 400, dtype: category\n", - "Categories (3, object): ['Bad', 'Good', 'Medium']" + "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=(), encoder=Contrast(method=None))" ] }, "execution_count": 7, @@ -240,28 +556,36 @@ } ], "source": [ - "Carseats['ShelveLoc']" + "shelve" ] }, { "cell_type": "markdown", - "id": "phantom-assurance", + "id": "b3be8808-1dbf-4154-882b-f61656a2ed4e", "metadata": {}, "source": [ - "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n", - "`get_columns` and `fit_encoder`." + "The `Column` object can be used to directly extract relevant columns from a `pd.DataFrame`. If the `encoder` field is not\n", + "`None`, then the extracted columns will be passed through `encoder`.\n", + "The `get_columns` method produces these columns as well as names for the columns." ] }, { "cell_type": "code", "execution_count": 8, - "id": "blind-harvest", + "id": "0ebadfc0-0ea2-4abc-aac6-ef78be227ce1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())" + "(array([[1., 0., 0.],\n", + " [0., 1., 0.],\n", + " [0., 0., 1.],\n", + " ...,\n", + " [0., 0., 1.],\n", + " [1., 0., 0.],\n", + " [0., 1., 0.]]),\n", + " ['ShelveLoc[Bad]', 'ShelveLoc[Good]', 'ShelveLoc[Medium]'])" ] }, "execution_count": 8, @@ -270,27 +594,89 @@ } ], "source": [ - "design.column_info_['ShelveLoc']" + "shelve.get_columns(Carseats)" ] }, { "cell_type": "markdown", - "id": "suspended-affairs", + "id": "269e6d18-4ae4-4a77-8498-90281ae7c803", "metadata": {}, "source": [ - "It recognized ordinal columns as well." + "Let's now fit a simple OLS model with this design." 
] }, { "cell_type": "code", "execution_count": 9, - "id": "military-locking", + "id": "411238d0-dd36-4878-a869-e8ce0ada099c", "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
coefstd errtP>|t|
ShelveLoc[Bad]12.00180.50323.8390.0
ShelveLoc[Good]16.89760.52232.3860.0
ShelveLoc[Medium]13.86380.48728.4670.0
Price-0.05670.004-13.9670.0
\n", + "
" + ], + "text/plain": [ + " coef std err t P>|t|\n", + "ShelveLoc[Bad] 12.0018 0.503 23.839 0.0\n", + "ShelveLoc[Good] 16.8976 0.522 32.386 0.0\n", + "ShelveLoc[Medium] 13.8638 0.487 28.467 0.0\n", + "Price -0.0567 0.004 -13.967 0.0" ] }, "execution_count": 9, @@ -299,19 +685,166 @@ } ], "source": [ - "design.column_info_['OIncome']" + "X = MS_contr.transform(Carseats)\n", + "Y = Carseats['Sales']\n", + "M_ols = sm.OLS(Y, X).fit()\n", + "summarize(M_ols)" + ] + }, + { + "cell_type": "markdown", + "id": "40ddf68e-7d58-4e30-93a8-5b7fe840d37a", + "metadata": {}, + "source": [ + "## Interactions\n", + "\n", + "One of the common uses of formulae in `R` is to specify interactions between variables.\n", + "This is done in `ModelSpec` by including a tuple in the `terms` argument." ] }, { "cell_type": "code", "execution_count": 10, - "id": "italic-shakespeare", + "id": "3f5e314c-7a7f-4e8d-bb07-295beb42c728", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptShelveLoc[Bad]:PriceShelveLoc[Good]:PriceShelveLoc[Medium]:PricePrice
01.0120.00.00.0120
11.00.083.00.083
21.00.00.080.080
31.00.00.097.097
41.0128.00.00.0128
51.072.00.00.072
61.00.00.0108.0108
71.00.0120.00.0120
81.00.00.0124.0124
91.00.00.0124.0124
\n", + "
" + ], "text/plain": [ - "(array([ 73, 48, 35, 100]), ('Income',))" + " intercept ShelveLoc[Bad]:Price ShelveLoc[Good]:Price \\\n", + "0 1.0 120.0 0.0 \n", + "1 1.0 0.0 83.0 \n", + "2 1.0 0.0 0.0 \n", + "3 1.0 0.0 0.0 \n", + "4 1.0 128.0 0.0 \n", + "5 1.0 72.0 0.0 \n", + "6 1.0 0.0 0.0 \n", + "7 1.0 0.0 120.0 \n", + "8 1.0 0.0 0.0 \n", + "9 1.0 0.0 0.0 \n", + "\n", + " ShelveLoc[Medium]:Price Price \n", + "0 0.0 120 \n", + "1 0.0 83 \n", + "2 80.0 80 \n", + "3 97.0 97 \n", + "4 0.0 128 \n", + "5 0.0 72 \n", + "6 108.0 108 \n", + "7 0.0 120 \n", + "8 124.0 124 \n", + "9 124.0 124 " ] }, "execution_count": 10, @@ -320,65 +853,71 @@ } ], "source": [ - "income = design.column_info_['Income']\n", - "cols, names = income.get_columns(Carseats)\n", - "(cols[:4], names)" + "ModelSpec([(shelve, 'Price'), 'Price']).fit_transform(Carseats).iloc[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "3f85fcb2-f0ef-4c1b-a89f-fcf083937274", + "metadata": {}, + "source": [ + "The above design matrix is clearly rank deficient, as `ModelSpec` has not inspected the formula\n", + "and attempted to produce a corresponding matrix that may or may not match a user's intent." ] }, { "cell_type": "markdown", - "id": "medieval-speed", + "id": "excellent-hamilton", "metadata": {}, "source": [ - "## Encoding a column\n", + "## Ordinal variables\n", "\n", - "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n", - "variables usually are encoded by several columns, typically one less than the number of categories.\n", - "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n", - "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n", - "its encoder the first time data is passed to it." 
+ "Ordinal variables are handled by a corresponding encoder)" ] }, { "cell_type": "code", "execution_count": 11, - "id": "public-basket", + "id": "going-administrator", + "metadata": {}, + "outputs": [], + "source": [ + "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", + " [0,50,90,200], \n", + " labels=['L','M','H'])\n", + "MS_order = ModelSpec(['OIncome']).fit(Carseats)" + ] + }, + { + "cell_type": "markdown", + "id": "5e1defb1-071b-4751-9358-b8d2f0b3412e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([[0., 0.],\n", - " [1., 0.],\n", - " [0., 1.],\n", - " [0., 1.]]),\n", - " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "shelve = design.column_info_['ShelveLoc']\n", - "cols, names = shelve.get_columns(Carseats)\n", - "(cols[:4], names)" + "Part of the `fit` method of `ModelSpec` involves inspection of the columns of `Carseats`. \n", + "The results of that inspection can be found in the `column_info_` attribute:" ] }, { "cell_type": "code", "execution_count": 12, - "id": "improved-alloy", + "id": "050fb4ae-648d-429d-9cb2-8423ad9707d7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[2.],\n", - " [1.],\n", - " [1.],\n", - " [0.]])" + "{'Sales': Column(idx='Sales', name='Sales', is_categorical=False, is_ordinal=False, columns=('Sales',), encoder=None),\n", + " 'CompPrice': Column(idx='CompPrice', name='CompPrice', is_categorical=False, is_ordinal=False, columns=('CompPrice',), encoder=None),\n", + " 'Income': Column(idx='Income', name='Income', is_categorical=False, is_ordinal=False, columns=('Income',), encoder=None),\n", + " 'Advertising': Column(idx='Advertising', name='Advertising', is_categorical=False, is_ordinal=False, columns=('Advertising',), encoder=None),\n", + " 'Population': Column(idx='Population', name='Population', is_categorical=False, is_ordinal=False, columns=('Population',), 
encoder=None),\n", + " 'Price': Column(idx='Price', name='Price', is_categorical=False, is_ordinal=False, columns=('Price',), encoder=None),\n", + " 'ShelveLoc': Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast()),\n", + " 'Age': Column(idx='Age', name='Age', is_categorical=False, is_ordinal=False, columns=('Age',), encoder=None),\n", + " 'Education': Column(idx='Education', name='Education', is_categorical=False, is_ordinal=False, columns=('Education',), encoder=None),\n", + " 'Urban': Column(idx='Urban', name='Urban', is_categorical=True, is_ordinal=False, columns=('Urban[Yes]',), encoder=Contrast()),\n", + " 'US': Column(idx='US', name='US', is_categorical=True, is_ordinal=False, columns=('US[Yes]',), encoder=Contrast()),\n", + " 'OIncome': Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())}" ] }, "execution_count": 12, @@ -387,33 +926,32 @@ } ], "source": [ - "oincome = design.column_info_['OIncome']\n", - "oincome.get_columns(Carseats)[0][:4]" + "MS_order.column_info_" ] }, { "cell_type": "markdown", - "id": "frank-mathematics", + "id": "debf7e2e-0a9d-451b-866c-66c0df9f43e5", "metadata": {}, "source": [ - "## The terms\n", + "## Structure of a `ModelSpec`\n", "\n", - "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n", - "the `terms` argument which should be a sequence. The elements of `terms` are often\n", - "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n", - "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple." + "The first argument to `ModelSpec` is stored as the `terms` attribute. 
Under the hood,\n", + "this sequence is inspected to produce the `terms_` attribute which specify the objects\n", + "that will ultimately create the design matrix." ] }, { "cell_type": "code", "execution_count": 13, - "id": "together-north", + "id": "ea51e988-0857-4d49-9987-d7531b34a233", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Price', 'Income']" + "[Feature(variables=('ShelveLoc',), name='ShelveLoc', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", + " Feature(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" ] }, "execution_count": 13, @@ -422,64 +960,145 @@ } ], "source": [ - "design.terms" + "MS = ModelSpec(['ShelveLoc', 'Price'])\n", + "MS.fit(Carseats)\n", + "MS.terms_" + ] + }, + { + "cell_type": "markdown", + "id": "warming-mobile", + "metadata": {}, + "source": [ + "Each element of `terms_` should be a `Feature` which describes a set of columns to be extracted from\n", + "a columnar data form as well as possible a possible encoder." ] }, { "cell_type": "code", "execution_count": 14, - "id": "chinese-necessity", + "id": "59214a70-1e6b-41c4-9f44-a92d340723c9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "design.terms_" + "shelve_var = MS.terms_[0]" ] }, { "cell_type": "markdown", - "id": "simplified-chinese", + "id": "5fed3ea2-ff50-4e5d-819d-a948f121f9d3", "metadata": {}, "source": [ - "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. 
A\n", - "`Variable` can also create columns through the `build_columns` method of `ModelSpec`" + "We can find the columns associated to each term using the `build_columns` method of `ModelSpec`:" ] }, { "cell_type": "code", "execution_count": 15, - "id": "automotive-hobby", + "id": "5e25ef64-497d-4f42-9f20-3d4a320cda23", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ShelveLoc[Good]ShelveLoc[Medium]
00.00.0
11.00.0
20.01.0
30.01.0
40.00.0
.........
3951.00.0
3960.01.0
3970.01.0
3980.00.0
3991.00.0
\n", + "

400 rows × 2 columns

\n", + "
" + ], "text/plain": [ - "( Price\n", - " 0 120\n", - " 1 83\n", - " 2 80\n", - " 3 97\n", - " 4 128\n", - " .. ...\n", - " 395 128\n", - " 396 120\n", - " 397 159\n", - " 398 95\n", - " 399 120\n", - " \n", - " [400 rows x 1 columns],\n", - " ['Price'])" + " ShelveLoc[Good] ShelveLoc[Medium]\n", + "0 0.0 0.0\n", + "1 1.0 0.0\n", + "2 0.0 1.0\n", + "3 0.0 1.0\n", + "4 0.0 0.0\n", + ".. ... ...\n", + "395 1.0 0.0\n", + "396 0.0 1.0\n", + "397 0.0 1.0\n", + "398 0.0 0.0\n", + "399 1.0 0.0\n", + "\n", + "[400 rows x 2 columns]" ] }, "execution_count": 15, @@ -488,280 +1107,37 @@ } ], "source": [ - "price = design.terms_[0]\n", - "design.build_columns(Carseats, price)" + "df, names = build_columns(MS.column_info_,\n", + " Carseats, \n", + " shelve_var)\n", + "df" ] }, { "cell_type": "markdown", - "id": "former-spring", - "metadata": {}, - "source": [ - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n", - "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", - "the call to `ModelSpec.fit`." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "floral-liabilities", + "id": "63edf7a2-e776-45b0-b434-d676d7e13dbd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price Income UIncome[L] UIncome[M]\n", - " 0 120.0 73.0 0.0 1.0\n", - " 1 83.0 48.0 1.0 0.0\n", - " 2 80.0 35.0 1.0 0.0\n", - " 3 97.0 100.0 0.0 0.0\n", - " 4 128.0 64.0 0.0 1.0\n", - " .. ... ... ... 
...\n", - " 395 128.0 108.0 0.0 0.0\n", - " 396 120.0 23.0 1.0 0.0\n", - " 397 159.0 26.0 1.0 0.0\n", - " 398 95.0 79.0 0.0 1.0\n", - " 399 120.0 37.0 1.0 0.0\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n", - "design.build_columns(Carseats, new_var)" + "The design matrix is constructed by running through `terms_` and concatenating the corresponding columns." ] }, { "cell_type": "markdown", - "id": "reasonable-canadian", + "id": "former-spring", "metadata": {}, "source": [ - "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", - "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." + "### `Feature` objects\n", + "\n", + "Note that `Feature` objects have a tuple of `variables` as well as an `encoder` attribute. The\n", + "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", + "is run through `encoder.transform`. The `encoder.fit` method of each `Feature` is run once during \n", + "the call to `ModelSpec.fit`." ] }, { "cell_type": "code", - "execution_count": 17, - "id": "imported-measure", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( mynewvar[0] mynewvar[1]\n", - " 0 -3.608693 -4.853177\n", - " 1 15.081506 35.708630\n", - " 2 27.422871 40.774250\n", - " 3 -33.973209 13.470489\n", - " 4 6.567316 -11.290100\n", - " .. ... 
...\n", - " 395 -36.846346 -18.415783\n", - " 396 45.741500 3.245602\n", - " 397 49.097533 -35.725355\n", - " 398 -13.577772 18.845139\n", - " 399 31.927566 0.978436\n", - " \n", - " [400 rows x 2 columns],\n", - " ['mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=2)\n", - "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n", - "design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "markdown", - "id": "institutional-burden", - "metadata": {}, - "source": [ - "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "western-bloom", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( Price Price mynewvar[0] mynewvar[1]\n", - " 0 120.0 120.0 -3.608693 -4.853177\n", - " 1 83.0 83.0 15.081506 35.708630\n", - " 2 80.0 80.0 27.422871 40.774250\n", - " 3 97.0 97.0 -33.973209 13.470489\n", - " 4 128.0 128.0 6.567316 -11.290100\n", - " .. ... ... ... 
...\n", - " 395 128.0 128.0 -36.846346 -18.415783\n", - " 396 120.0 120.0 45.741500 3.245602\n", - " 397 159.0 159.0 49.097533 -35.725355\n", - " 398 95.0 95.0 -13.577772 18.845139\n", - " 399 120.0 120.0 31.927566 0.978436\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n", - "design.build_columns(Carseats, fancy_var)" - ] - }, - { - "cell_type": "markdown", - "id": "ordinary-newman", - "metadata": {}, - "source": [ - "We can of course run PCA again on these features (if we wanted)." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "modern-negotiation", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( fancy_pca[0] fancy_pca[1]\n", - " 0 -6.951792 4.859283\n", - " 1 55.170148 -24.694875\n", - " 2 59.418556 -38.033572\n", - " 3 34.722389 28.922184\n", - " 4 -21.419184 -3.120673\n", - " .. ... 
...\n", - " 395 -18.257348 40.760122\n", - " 396 -10.546709 -45.021658\n", - " 397 -77.706359 -37.174379\n", - " 398 36.668694 7.730851\n", - " 399 -9.540535 -31.059122\n", - " \n", - " [400 rows x 2 columns],\n", - " ['fancy_pca[0]', 'fancy_pca[1]'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca2 = PCA(n_components=2)\n", - "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n", - "design.build_columns(Carseats, pca2_var)" - ] - }, - { - "cell_type": "markdown", - "id": "private-shepherd", - "metadata": {}, - "source": [ - "## Building the design matrix\n", - "\n", - "With these notions in mind, the final design is essentially then" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "hollywood-union", - "metadata": {}, - "outputs": [], - "source": [ - "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "suffering-lover", - "metadata": {}, - "source": [ - "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n", - "a dataframe the index is adjusted accordingly." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "successful-express", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.intercept" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "banner-metadata", + "execution_count": 18, + "id": "floral-liabilities", "metadata": {}, "outputs": [ { @@ -785,1227 +1161,643 @@ " \n", " \n", " \n", - " intercept\n", " Price\n", " Income\n", + " OIncome\n", " \n", " \n", " \n", " \n", " 0\n", - " 1.0\n", - " 120\n", - " 73\n", + " 120.0\n", + " 73.0\n", + " 2.0\n", " \n", " \n", " 1\n", + " 83.0\n", + " 48.0\n", " 1.0\n", - " 83\n", - " 48\n", " \n", " \n", " 2\n", + " 80.0\n", + " 35.0\n", " 1.0\n", - " 80\n", - " 35\n", " \n", " \n", " 3\n", + " 97.0\n", + " 100.0\n", + " 0.0\n", + " \n", + " \n", + " 4\n", + " 128.0\n", + " 64.0\n", + " 2.0\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 395\n", + " 128.0\n", + " 108.0\n", + " 0.0\n", + " \n", + " \n", + " 396\n", + " 120.0\n", + " 23.0\n", + " 1.0\n", + " \n", + " \n", + " 397\n", + " 159.0\n", + " 26.0\n", + " 1.0\n", + " \n", + " \n", + " 398\n", + " 95.0\n", + " 79.0\n", + " 2.0\n", + " \n", + " \n", + " 399\n", + " 120.0\n", + " 37.0\n", " 1.0\n", - " 97\n", - " 100\n", " \n", " \n", "\n", + "

400 rows × 3 columns

\n", "" ], "text/plain": [ - " intercept Price Income\n", - "0 1.0 120 73\n", - "1 1.0 83 48\n", - "2 1.0 80 35\n", - "3 1.0 97 100" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.transform(Carseats)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "absent-branch", - "metadata": {}, - "source": [ - "## Predicting\n", - "\n", - "Constructing the design matrix at any values is carried out by the `transform` method." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "naked-hollywood", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" + " Price Income OIncome\n", + "0 120.0 73.0 2.0\n", + "1 83.0 48.0 1.0\n", + "2 80.0 35.0 1.0\n", + "3 97.0 100.0 0.0\n", + "4 128.0 64.0 2.0\n", + ".. ... ... ...\n", + "395 128.0 108.0 0.0\n", + "396 120.0 23.0 1.0\n", + "397 159.0 26.0 1.0\n", + "398 95.0 79.0 2.0\n", + "399 120.0 37.0 1.0\n", + "\n", + "[400 rows x 3 columns]" ] }, - "execution_count": 23, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n", - "new_X = design.transform(new_data)\n", - "M.get_prediction(new_X).predicted_mean" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "iraqi-divorce", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 \n", - "12.65258 12.25873 \n" - ] - } - ], - "source": [ - "%%R -i new_data,Carseats\n", - "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)" + "new_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None)\n", + "build_columns(MS.column_info_,\n", + " Carseats, \n", + " new_var)[0]" ] }, { "cell_type": "markdown", - "id": "signal-yahoo", + "id": "reasonable-canadian", "metadata": {}, "source": [ - "### Difference between using `pd.DataFrame` and `np.ndarray`\n", - "\n", - "If 
the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "\n", - "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning." + "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", + "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." ] }, { "cell_type": "code", - "execution_count": 25, - "id": "completed-surveillance", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1.0, 120, 73],\n", - " [1.0, 83, 48],\n", - " [1.0, 80, 35],\n", - " [1.0, 97, 100]], dtype=object)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n", - "design_np = ModelSpec([0,3]).fit(Carseats_np)\n", - "design_np.transform(Carseats_np)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "undefined-sacrifice", - "metadata": {}, - "source": [ - "The following will fail for hopefully obvious reasons" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "incredible-concert", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n" - ] - } - ], - "source": [ - "try:\n", - " new_D = np.zeros((2,2))\n", - " new_D[:,0] = [10,20]\n", - " new_D[:,1] = [40,50]\n", - " M.get_prediction(new_D).predicted_mean\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "allied-botswana", - "metadata": {}, - "source": [ - "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", - "with a matrix having 3 columns (the first representing an 
intercept).\n", - "\n", - "We might be tempted to try as with the `pd.DataFrame` and produce\n", - "an `np.ndarray` with only the necessary variables." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "stunning-container", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "index 3 is out of bounds for axis 1 with size 2\n" - ] - } - ], - "source": [ - "try:\n", - " new_X = np.zeros((2,2))\n", - " new_X[:,0] = [10,20]\n", - " new_X[:,1] = [40,50]\n", - " new_D = design_np.transform(new_X)\n", - " M.get_prediction(new_D).predicted_mean\n", - "except IndexError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "specific-tobacco", - "metadata": {}, - "source": [ - "This fails because `design_np` is looking for column `3` from its `terms`:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "latin-publisher", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design_np.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "rocky-franchise", - "metadata": {}, - "source": [ - "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n", - "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "returning-matthew", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - 
], - "source": [ - "new_X = np.zeros((2,4))\n", - "new_X[:,0] = [10,20]\n", - "new_X[:,3] = [40,50]\n", - "new_D = design_np.transform(new_X)\n", - "M.get_prediction(new_D).predicted_mean" - ] - }, - { - "cell_type": "markdown", - "id": "sapphire-adelaide", - "metadata": {}, - "source": [ - "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n", - "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case." - ] - }, - { - "cell_type": "markdown", - "id": "standing-involvement", - "metadata": {}, - "source": [ - "## A model with some categorical variables\n", - "\n", - "Categorical variables become `Column` instances with encoders." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "taken-university", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n", - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "rural-cycling", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "former-trick", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 11.876012\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[L] -1.042297\n", - "UIncome[M] 
-0.119123\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "specialized-processing", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "verified-administrator", - "metadata": {}, - "source": [ - "## Getting the encoding you want\n", - "\n", - "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n", - "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n", - "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n", - "where `new_encoder` would somehow drop the column we want dropped. 
\n", - "\n", - "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "limited-johns", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "saving-remainder", - "metadata": {}, - "source": [ - "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n", - "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. 
This is a design of interest\n", - "and we need a way to allow different encodings of the same column of `Carseats`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "satisfied-harbor", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "silver-wesley", - "metadata": {}, - "source": [ - " We can create a new \n", - "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "crazy-bikini", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import contrast\n", - "pref_encoding = contrast('UIncome', 'drop', 'L')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "accredited-barrier", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[M] UIncome[H]\n", - " 0 1.0 0.0\n", - " 1 0.0 0.0\n", - " 2 0.0 0.0\n", - " 3 0.0 1.0\n", - " 4 1.0 0.0\n", - " .. ... 
...\n", - " 395 0.0 1.0\n", - " 396 0.0 0.0\n", - " 397 0.0 0.0\n", - " 398 1.0 0.0\n", - " 399 0.0 0.0\n", - " \n", - " [400 rows x 2 columns],\n", - " ['UIncome[M]', 'UIncome[H]'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, pref_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "smaller-execution", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "limited-center", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 10.833715\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[M] 0.923174\n", - "UIncome[H] 1.042297\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "combined-relaxation", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "focal-determination", - "metadata": {}, - 
"source": [ - "## Interactions\n", - "\n", - "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n", - "to `ModelSpec`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "earned-ready", + "execution_count": 20, + "id": "imported-measure", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mynewvar[0]mynewvar[1]
0-3.595740-4.850530
115.07040135.706773
227.41222840.772377
3-33.98304813.468087
46.580644-11.287452
.........
395-36.856308-18.418138
39645.7315203.243768
39749.087659-35.727136
398-13.56517818.847760
39931.9170720.976615
\n", + "

400 rows × 2 columns

\n", + "
" + ], "text/plain": [ - "intercept 7.866634\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "UIncome[L] -2.734895\n", - "UIncome[M] -2.619745\n", - "dtype: float64" + " mynewvar[0] mynewvar[1]\n", + "0 -3.595740 -4.850530\n", + "1 15.070401 35.706773\n", + "2 27.412228 40.772377\n", + "3 -33.983048 13.468087\n", + "4 6.580644 -11.287452\n", + ".. ... ...\n", + "395 -36.856308 -18.418138\n", + "396 45.731520 3.243768\n", + "397 49.087659 -35.727136\n", + "398 -13.565178 18.847760\n", + "399 31.917072 0.976615\n", + "\n", + "[400 rows x 2 columns]" ] }, - "execution_count": 41, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" + "from sklearn.decomposition import PCA\n", + "pca = PCA(n_components=2)\n", + "pca.fit(build_columns(MS.column_info_, Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", + "pca_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca)\n", + "build_columns(MS.column_info_,\n", + " Carseats, \n", + " pca_var)[0]" ] }, { "cell_type": "markdown", - "id": "prescribed-accessory", + "id": "institutional-burden", "metadata": {}, "source": [ - "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n", - "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`." + "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", + "or `Feature` instances (`pca_var`)." 
] }, { "cell_type": "code", - "execution_count": 42, - "id": "pacific-animal", + "execution_count": 21, + "id": "western-bloom", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IncomePricemynewvar[0]mynewvar[1]
073.0120.0-3.595740-4.850530
148.083.015.07040135.706773
235.080.027.41222840.772377
3100.097.0-33.98304813.468087
464.0128.06.580644-11.287452
...............
395108.0128.0-36.856308-18.418138
39623.0120.045.7315203.243768
39726.0159.049.087659-35.727136
39879.095.0-13.56517818.847760
39937.0120.031.9170720.976615
\n", + "

400 rows × 4 columns

\n", + "
" + ], "text/plain": [ - "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n", - " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n", - " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)" + " Income Price mynewvar[0] mynewvar[1]\n", + "0 73.0 120.0 -3.595740 -4.850530\n", + "1 48.0 83.0 15.070401 35.706773\n", + "2 35.0 80.0 27.412228 40.772377\n", + "3 100.0 97.0 -33.983048 13.468087\n", + "4 64.0 128.0 6.580644 -11.287452\n", + ".. ... ... ... ...\n", + "395 108.0 128.0 -36.856308 -18.418138\n", + "396 23.0 120.0 45.731520 3.243768\n", + "397 26.0 159.0 49.087659 -35.727136\n", + "398 79.0 95.0 -13.565178 18.847760\n", + "399 37.0 120.0 31.917072 0.976615\n", + "\n", + "[400 rows x 4 columns]" ] }, - "execution_count": 42, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design.terms_[0]" + "price = MS.column_info_['Price']\n", + "fancy_var = Feature(('Income', price, pca_var), name='fancy', encoder=None)\n", + "build_columns(MS.column_info_,\n", + " Carseats, \n", + " fancy_var)[0]" ] }, { "cell_type": "markdown", - "id": "planned-wrestling", + "id": "e289feba-e3f5-48e0-9e29-cdd88d7f9923", "metadata": {}, "source": [ - "Comparing this to the previous `R` model." 
+ "## Predicting at new points" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "given-testimony", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "external-barrier", - "metadata": {}, - "source": [ - "We note a few important things:\n", - "\n", - "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n", - "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n", - "\n", - "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n", - "\n", - "A few notes:\n", - "\n", - "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n", - "\n", - "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily." 
- ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "authentic-meditation", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[H] UIncome[L] UIncome[M]\n", - " 0 0.0 0.0 1.0\n", - " 1 0.0 1.0 0.0\n", - " 2 0.0 1.0 0.0\n", - " 3 1.0 0.0 0.0\n", - " 4 0.0 0.0 1.0\n", - " .. ... ... ...\n", - " 395 1.0 0.0 0.0\n", - " 396 0.0 1.0 0.0\n", - " 397 0.0 1.0 0.0\n", - " 398 0.0 0.0 1.0\n", - " 399 0.0 1.0 0.0\n", - " \n", - " [400 rows x 3 columns],\n", - " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_encoding = contrast('UIncome', None)\n", - "design.build_columns(Carseats, full_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "lucky-success", + "execution_count": 22, + "id": "6efed2fa-9e5d-429c-a8d9-ac544cab2b41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "intercept 5.131739\n", - "UIncome[M] 0.115150\n", - "UIncome[H] 1.156118\n", - "UIncome[H]:ShelveLoc[Good] 3.738052\n", - "UIncome[H]:ShelveLoc[Medium] 1.514104\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", + "intercept 12.661546\n", + "Price -0.052213\n", + "Income 0.012829\n", "dtype: float64" ] }, - "execution_count": 45, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" + "MS = ModelSpec(['Price', 'Income']).fit(Carseats)\n", + "X = MS.transform(Carseats)\n", + "Y = Carseats['Sales']\n", + "M_ols = sm.OLS(Y, X).fit()\n", + "M_ols.params" ] }, { "cell_type": "markdown", - "id": "laden-beach", - "metadata": {}, - "source": [ - "## Special encodings\n", - "\n", - "For flexible models, we may want 
to consider transformations of features, i.e. polynomial\n", - "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n", - "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n", - "package includes a `Poly` transform" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "copyrighted-luther", + "id": "e6b4609b-fcb2-4cc2-b630-509df4c87546", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('Income',), name='poly(Income, 3)', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "from ISLP.models.model_spec import poly\n", - "poly('Income', 3)" + "As `ModelSpec` is a transformer, it can be evaluated at new feature values.\n", + "Constructing the design matrix at any values is carried out by the `transform` method." ] }, { "cell_type": "code", - "execution_count": 47, - "id": "threatened-marine", + "execution_count": 23, + "id": "8784b0e8-ce53-4a90-aee6-b935834295c7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "intercept 5.440077\n", - "poly(Income, 3)[0] 10.036373\n", - "poly(Income, 3)[1] -2.799156\n", - "poly(Income, 3)[2] 2.399601\n", - "ShelveLoc[Good] 4.808133\n", - "ShelveLoc[Medium] 1.889533\n", - "dtype: float64" + "array([10.70130676, 10.307465 ])" ] }, - "execution_count": 47, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" + "new_data = pd.DataFrame({'Price':[40, 50], 'Income':[10, 20]})\n", + "new_X = MS.transform(new_data)\n", + "M_ols.get_prediction(new_X).predicted_mean" ] }, { "cell_type": "markdown", - "id": "senior-spokesman", - "metadata": {}, - "source": [ - "Compare:" - ] - }, - { - 
"cell_type": "code", - "execution_count": 48, - "id": "prompt-fifteen", + "id": "signal-yahoo", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n", - " 5.440077 10.036373 -2.799156 2.399601 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.808133 1.889533 \n" - ] - } - ], "source": [ - "%%R\n", - "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef" + "## Using `np.ndarray`\n", + "\n", + "As the basic model is to concatenate columns extracted from a columnar data\n", + "representation, one *can* use `np.ndarray` as the column data. In this case,\n", + "columns will be selected by integer indices. \n", + "\n", + "### Caveats using `np.ndarray`\n", + "\n", + "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", + "However,\n", + "unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues.\n", + "\n", + "However, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", + "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. \n", + "\n", + "We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new\n", + "values of `Price` and `Location`. We first find the predicitions using `pd.DataFrame` and then illustrate the difficulties\n", + "in using `np.ndarray`." 
] }, { "cell_type": "markdown", - "id": "better-christianity", + "id": "e7ffdd07-4d6b-4a4c-ab38-ad1270e85de6", "metadata": {}, "source": [ - "## Splines\n", - "\n", - "Support for natural and B-splines is also included" + "We will refit this model, using `ModelSpec` with an `np.ndarray` instead" ] }, { "cell_type": "code", - "execution_count": 49, - "id": "outstanding-performer", + "execution_count": 24, + "id": "4fec9030-7445-48be-a15f-2ac5a789e717", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "intercept 4.240421\n", - "ns(Income, df=5)[0] 1.468196\n", - "ns(Income, df=5)[1] 1.499471\n", - "ns(Income, df=5)[2] 1.152070\n", - "ns(Income, df=5)[3] 2.418398\n", - "ns(Income, df=5)[4] 1.804460\n", - "ShelveLoc[Good] 4.810449\n", - "ShelveLoc[Medium] 1.881095\n", - "dtype: float64" + "array([[ 1., 120., 73.],\n", + " [ 1., 83., 48.],\n", + " [ 1., 80., 35.],\n", + " ...,\n", + " [ 1., 159., 26.],\n", + " [ 1., 95., 79.],\n", + " [ 1., 120., 37.]])" ] }, - "execution_count": 49, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from ISLP.models.model_spec import ns, bs, pca\n", - "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "informative-spirituality", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n", - " 4.240421 1.468196 1.499471 1.152070 \n", - "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n", - " 2.418398 1.804460 4.810449 1.881095 \n" - ] - } - ], - "source": [ - "%%R\n", - "library(splines)\n", - "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef" + "Carseats_np = np.asarray(Carseats[['Price', 'Education', 'Income']])\n", + "MS_np = ModelSpec([0,2]).fit(Carseats_np)\n", + 
"MS_np.transform(Carseats_np)" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "destroyed-complexity", + "execution_count": 25, + "id": "c864e365-2476-4ca6-9d27-625cac2b2271", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "intercept 3.495085\n", - "bs(Income, df=7, degree=2)[0] 1.813118\n", - "bs(Income, df=7, degree=2)[1] 0.961852\n", - "bs(Income, df=7, degree=2)[2] 2.471545\n", - "bs(Income, df=7, degree=2)[3] 2.158891\n", - "bs(Income, df=7, degree=2)[4] 2.091625\n", - "bs(Income, df=7, degree=2)[5] 2.600669\n", - "bs(Income, df=7, degree=2)[6] 2.843108\n", - "ShelveLoc[Good] 4.804919\n", - "ShelveLoc[Medium] 1.880337\n", + "const 12.661546\n", + "x1 -0.052213\n", + "x2 0.012829\n", "dtype: float64" ] }, - "execution_count": 51, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "incident-nicaragua", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) bs(Income, df = 7, degree = 2)1 \n", - " 3.4950851 1.8131176 \n", - "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n", - " 0.9618523 2.4715450 \n", - "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n", - " 2.1588908 2.0916252 \n", - "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n", - " 2.6006694 2.8431084 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.8049190 1.8803375 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef" + "M_ols_np = sm.OLS(Y, MS_np.transform(Carseats_np)).fit()\n", + "M_ols_np.params" ] }, { "cell_type": "markdown", - "id": "formal-medline", - "metadata": {}, - "source": [ - "## PCA" - ] - }, - { - "cell_type": "code", - "execution_count": 53, 
- "id": "general-joshua", + "id": "undefined-sacrifice", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.419405\n", - "pca(myvars, n_components=2)[0] -0.001131\n", - "pca(myvars, n_components=2)[1] -0.024217\n", - "ShelveLoc[Good] 4.816253\n", - "ShelveLoc[Medium] 1.924139\n", - "dtype: float64" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars'), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" + "Now, let's consider finding the design matrix at new points. \n", + "When using `pd.DataFrame` we only need to supply the `transform` method\n", + "a data frame with columns implicated in the `terms` argument (in this case, `Price` and `Income`). \n", + "\n", + "However, when using `np.ndarray` with integers as indices, `Price` was column 0 and `Income` was column 2. The only\n", + "sensible way to produce a return for predict is to extract its 0th and 2nd columns. 
Note this means\n", + "that the meaning of columns in an `np.ndarray` provided to `transform` essentially must be identical to those\n", + "passed to `fit`." ] }, { "cell_type": "code", - "execution_count": 54, - "id": "coordinate-calcium", + "execution_count": 26, + "id": "incredible-concert", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.419405 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n", - " 0.001131 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n", - " -0.024217 \n", - " ShelveLocGood \n", - " 4.816253 \n", - " ShelveLocMedium \n", - " 1.924139 \n", - "\n" + "index 2 is out of bounds for axis 1 with size 2\n" ] } ], "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)" + "try:\n", + " new_D = np.array([[40,50], [10,20]]).T\n", + " new_X = MS_np.transform(new_D)\n", + "except IndexError as e:\n", + " print(e)" ] }, { "cell_type": "markdown", - "id": "foster-canvas", + "id": "allied-botswana", "metadata": {}, "source": [ - "It is of course common to scale before running PCA." + "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", + "with a matrix having 3 columns (the first representing an intercept).\n", + "\n", + "We might be tempted to try as with the `pd.DataFrame` and produce\n", + "an `np.ndarray` with only the necessary variables." 
] }, { "cell_type": "code", - "execution_count": 55, - "id": "geographic-founder", + "execution_count": 27, + "id": "stunning-container", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" + "[[ 1. 40. 10.]\n", + " [ 1. 50. 20.]]\n" ] }, { "data": { "text/plain": [ - "intercept 5.352159\n", - "pca(myvars, n_components=2)[0] 0.446383\n", - "pca(myvars, n_components=2)[1] -1.219788\n", - "ShelveLoc[Good] 4.922780\n", - "ShelveLoc[Medium] 2.005617\n", - "dtype: float64" + "array([10.70130676, 10.307465 ])" ] }, - "execution_count": 55, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars',\n", - " scale=True), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "floral-packaging", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - 
"Coefficients:\n", - " (Intercept) \n", - " 5.3522 \n", - "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n", - " 0.4469 \n", - "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n", - " -1.2213 \n", - " ShelveLocGood \n", - " 4.9228 \n", - " ShelveLocMedium \n", - " 2.0056 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)" + "new_D = np.array([[40,50], [np.nan, np.nan], [10,20]]).T\n", + "new_X = MS_np.transform(new_D)\n", + "print(new_X)\n", + "M_ols.get_prediction(new_X).predicted_mean" ] }, { "cell_type": "markdown", - "id": "social-cherry", - "metadata": {}, - "source": [ - "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n", - "of `np.std(ddof=1)`." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "another-glory", + "id": "specific-tobacco", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0.44694166, -1.22131519])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))" + "For more complicated design contructions ensuring the columns of `new_D` match that of the original data will be more cumbersome. We expect\n", + "then that `pd.DataFrame` (or a columnar data representation with similar API) will likely be easier to use with `ModelSpec`." 
] } ], @@ -2014,9 +1806,9 @@ "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "islp_test" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -2028,7 +1820,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/docs/jupyterbook/models/spec.md b/docs/jupyterbook/models/spec.md index fdf8c60..27bb3a4 100644 --- a/docs/jupyterbook/models/spec.md +++ b/docs/jupyterbook/models/spec.md @@ -5,490 +5,296 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: Python 3 (ipykernel) language: python - name: islp_test + name: python3 --- # Building design matrices with `ModelSpec` -Force rebuild +The `ISLP` package provides a facility to build design +matrices for regression and classification tasks. It provides similar functionality to the formula +notation of `R` though uses python objects rather than specification through the special formula syntax. + +Related tools include `patsy` and `ColumnTransformer` from `sklearn.compose`. + +Perhaps the most common use is to extract some columns from a `pd.DataFrame` and +produce a design matrix, optionally with an intercept. 
```{code-cell} ipython3 -x=4 -import numpy as np, pandas as pd -%load_ext rpy2.ipython +import pandas as pd +import numpy as np from ISLP import load_data -from ISLP.models import ModelSpec +from ISLP.models import (ModelSpec, + summarize, + Column, + Feature, + build_columns) import statsmodels.api as sm ``` ```{code-cell} ipython3 Carseats = load_data('Carseats') -%R -i Carseats Carseats.columns ``` -## Let's break up income into groups +We'll first build a design matrix that we can use to model `Sales` +in terms of the categorical variable `ShelveLoc` and `Price`. -```{code-cell} ipython3 -Carseats['OIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H']) -Carseats['OIncome'] -``` - -Let's also create an unordered version - -```{code-cell} ipython3 -Carseats['UIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H'], - ordered=False) -Carseats['UIncome'] -``` - -## A simple model - -```{code-cell} ipython3 -design = ModelSpec(['Price', 'Income']) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -Y = Carseats['Sales'] -M = sm.OLS(Y, X).fit() -M.params -``` - -## Basic procedure - -The design matrix is built by cobbling together a set of columns and possibly transforming them. -A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit` -is to inspect a dataframe for column info. The column `ShelveLoc` is categorical: +We see first that `ShelveLoc` is a categorical variable: ```{code-cell} ipython3 Carseats['ShelveLoc'] ``` -This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods -`get_columns` and `fit_encoder`. - -```{code-cell} ipython3 -design.column_info_['ShelveLoc'] -``` - -It recognized ordinal columns as well. 
- -```{code-cell} ipython3 -design.column_info_['OIncome'] -``` - -```{code-cell} ipython3 -income = design.column_info_['Income'] -cols, names = income.get_columns(Carseats) -(cols[:4], names) -``` - -## Encoding a column - -In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical -variables usually are encoded by several columns, typically one less than the number of categories. -This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform -model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits -its encoder the first time data is passed to it. - -```{code-cell} ipython3 -shelve = design.column_info_['ShelveLoc'] -cols, names = shelve.get_columns(Carseats) -(cols[:4], names) -``` - -```{code-cell} ipython3 -oincome = design.column_info_['OIncome'] -oincome.get_columns(Carseats)[0][:4] -``` - -## The terms +This is recognized by `ModelSpec` and only 2 columns are added for the three levels. The +default behavior is to drop the first level of the categories. Later, +we will show other contrasts of the 3 columns can be produced. -The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through -the `terms` argument which should be a sequence. The elements of `terms` are often -going to be strings (or tuples of strings for interactions, see below) but are converted to a -`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple. +This simple example below illustrates how the first argument (its `terms`) is +used to construct a design matrix. ```{code-cell} ipython3 -design.terms +MS = ModelSpec(['ShelveLoc', 'Price']) +X = MS.fit_transform(Carseats) +X.iloc[:10] ``` -```{code-cell} ipython3 -design.terms_ -``` - -While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. 
A -`Variable` can also create columns through the `build_columns` method of `ModelSpec` +We note that a column has been added for the intercept by default. This can be changed using the +`intercept` argument. ```{code-cell} ipython3 -price = design.terms_[0] -design.build_columns(Carseats, price) +MS_no1 = ModelSpec(['ShelveLoc', 'Price'], intercept=False) +MS_no1.fit_transform(Carseats)[:10] ``` -Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The -tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then -is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during -the call to `ModelSpec.fit`. +We see that `ShelveLoc` still only contributes +two columns to the design. The `ModelSpec` object does no introspection of its arguments to effectively include an intercept term +in the column space of the design matrix. -```{code-cell} ipython3 -from ISLP.models.model_spec import Variable - -new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None) -design.build_columns(Carseats, new_var) -``` - -Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the -arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`. +To include this intercept via `ShelveLoc` we can use 3 columns to encode this categorical variable. Following the nomenclature of +`R`, we call this a `Contrast` of the categorical variable. ```{code-cell} ipython3 -from sklearn.decomposition import PCA -pca = PCA(n_components=2) -pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit` -pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca) -design.build_columns(Carseats, pca_var) -``` - -The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`) -or `Variable` instances (`pca_var`). 
- -```{code-cell} ipython3 -fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None) -design.build_columns(Carseats, fancy_var) +from ISLP.models import contrast +shelve = contrast('ShelveLoc', None) +MS_contr = ModelSpec([shelve, 'Price'], intercept=False) +MS_contr.fit_transform(Carseats)[:10] ``` -We can of course run PCA again on these features (if we wanted). +This example above illustrates that columns need not be identified by name in `terms`. The basic +role of an item in the `terms` sequence is a description of how to extract a column +from a columnar data object, usually a `pd.DataFrame`. ```{code-cell} ipython3 -pca2 = PCA(n_components=2) -pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit` -pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2) -design.build_columns(Carseats, pca2_var) +shelve ``` -## Building the design matrix - -With these notions in mind, the final design is essentially then +The `Column` object can be used to directly extract relevant columns from a `pd.DataFrame`. If the `encoder` field is not +`None`, then the extracted columns will be passed through `encoder`. +The `get_columns` method produces these columns as well as names for the columns. ```{code-cell} ipython3 -X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4] +shelve.get_columns(Carseats) ``` -An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is -a dataframe the index is adjusted accordingly. +Let's now fit a simple OLS model with this design. ```{code-cell} ipython3 -design.intercept -``` - -```{code-cell} ipython3 -design.transform(Carseats)[:4] +X = MS_contr.transform(Carseats) +Y = Carseats['Sales'] +M_ols = sm.OLS(Y, X).fit() +summarize(M_ols) ``` -## Predicting +## Interactions -Constructing the design matrix at any values is carried out by the `transform` method. 
+One of the common uses of formulae in `R` is to specify interactions between variables. +This is done in `ModelSpec` by including a tuple in the `terms` argument. ```{code-cell} ipython3 -new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]}) -new_X = design.transform(new_data) -M.get_prediction(new_X).predicted_mean +ModelSpec([(shelve, 'Price'), 'Price']).fit_transform(Carseats).iloc[:10] ``` -```{code-cell} ipython3 -%%R -i new_data,Carseats -predict(lm(Sales ~ Price + Income, data=Carseats), new_data) -``` +The above design matrix is clearly rank deficient, as `ModelSpec` has not inspected the formula +and attempted to produce a corresponding matrix that may or may not match a user's intent. -### Difference between using `pd.DataFrame` and `np.ndarray` ++++ -If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns. +## Ordinal variables -If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so, -in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. +Ordinal variables are handled by a corresponding encoder) ```{code-cell} ipython3 -Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']]) -design_np = ModelSpec([0,3]).fit(Carseats_np) -design_np.transform(Carseats_np)[:4] +Carseats['OIncome'] = pd.cut(Carseats['Income'], + [0,50,90,200], + labels=['L','M','H']) +MS_order = ModelSpec(['OIncome']).fit(Carseats) ``` -The following will fail for hopefully obvious reasons +Part of the `fit` method of `ModelSpec` involves inspection of the columns of `Carseats`. 
+The results of that inspection can be found in the `column_info_` attribute: ```{code-cell} ipython3 -try: - new_D = np.zeros((2,2)) - new_D[:,0] = [10,20] - new_D[:,1] = [40,50] - M.get_prediction(new_D).predicted_mean -except ValueError as e: - print(e) +MS_order.column_info_ ``` -Ultimately, `M` expects 3 columns for new predictions because it was fit -with a matrix having 3 columns (the first representing an intercept). +## Structure of a `ModelSpec` -We might be tempted to try as with the `pd.DataFrame` and produce -an `np.ndarray` with only the necessary variables. +The first argument to `ModelSpec` is stored as the `terms` attribute. Under the hood, +this sequence is inspected to produce the `terms_` attribute which specify the objects +that will ultimately create the design matrix. ```{code-cell} ipython3 -try: - new_X = np.zeros((2,2)) - new_X[:,0] = [10,20] - new_X[:,1] = [40,50] - new_D = design_np.transform(new_X) - M.get_prediction(new_D).predicted_mean -except IndexError as e: - print(e) +MS = ModelSpec(['ShelveLoc', 'Price']) +MS.fit(Carseats) +MS.terms_ ``` -This fails because `design_np` is looking for column `3` from its `terms`: +Each element of `terms_` should be a `Feature` which describes a set of columns to be extracted from +a columnar data form as well as possible a possible encoder. 
```{code-cell} ipython3 -design_np.terms_ +shelve_var = MS.terms_[0] ``` -However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed -represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`: +We can find the columns associated to each term using the `build_columns` method of `ModelSpec`: ```{code-cell} ipython3 -new_X = np.zeros((2,4)) -new_X[:,0] = [10,20] -new_X[:,3] = [40,50] -new_D = design_np.transform(new_X) -M.get_prediction(new_D).predicted_mean +df, names = build_columns(MS.column_info_, + Carseats, + shelve_var) +df ``` -Given this subtlety about needing to supply arrays with identical column structure to `transform` when -using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case. +The design matrix is constructed by running through `terms_` and concatenating the corresponding columns. +++ -## A model with some categorical variables +### `Feature` objects -Categorical variables become `Column` instances with encoders. - -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats) -design.column_info_['UIncome'] -``` - -```{code-cell} ipython3 -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef -``` - -## Getting the encoding you want - -By default the level dropped by `ModelSpec` will be the first of the `categories_` values from -`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems -as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)` -where `new_encoder` would somehow drop the column we want dropped. 
- -However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`: - -```{code-cell} ipython3 -design.column_info_['UIncome'] -``` - -This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when -we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest -and we need a way to allow different encodings of the same column of `Carseats` - -```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) -``` - - We can create a new -`Column` with the encoder we want. For categorical variables, there is a convenience function to do so. - -```{code-cell} ipython3 -from ISLP.models.model_spec import contrast -pref_encoding = contrast('UIncome', 'drop', 'L') -``` - -```{code-cell} ipython3 -design.build_columns(Carseats, pref_encoding) -``` - -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` +Note that `Feature` objects have a tuple of `variables` as well as an `encoder` attribute. The +tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then +is run through `encoder.transform`. The `encoder.fit` method of each `Feature` is run once during +the call to `ModelSpec.fit`. ```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef +new_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None) +build_columns(MS.column_info_, + Carseats, + new_var)[0] ``` -## Interactions - -We've referred to interactions above. 
These are specified (by convenience) as tuples in the `terms` argument -to `ModelSpec`. +Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the +arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`. ```{code-cell} ipython3 -design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params +from sklearn.decomposition import PCA +pca = PCA(n_components=2) +pca.fit(build_columns(MS.column_info_, Carseats, new_var)[0]) # this is done within `ModelSpec.fit` +pca_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca) +build_columns(MS.column_info_, + Carseats, + pca_var)[0] ``` -The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with -`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`. +The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`) +or `Feature` instances (`pca_var`). ```{code-cell} ipython3 -design.terms_[0] +price = MS.column_info_['Price'] +fancy_var = Feature(('Income', price, pca_var), name='fancy', encoder=None) +build_columns(MS.column_info_, + Carseats, + fancy_var)[0] ``` -Comparing this to the previous `R` model. +## Predicting at new points ```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) +MS = ModelSpec(['Price', 'Income']).fit(Carseats) +X = MS.transform(Carseats) +Y = Carseats['Sales'] +M_ols = sm.OLS(Y, X).fit() +M_ols.params ``` -We note a few important things: - -1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these -columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!** - -2. 
As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.** - -A few notes: - -- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.** - -- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily. - -```{code-cell} ipython3 -full_encoding = contrast('UIncome', None) -design.build_columns(Carseats, full_encoding) -``` +As `ModelSpec` is a transformer, it can be evaluated at new feature values. +Constructing the design matrix at any values is carried out by the `transform` method. ```{code-cell} ipython3 -design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params +new_data = pd.DataFrame({'Price':[40, 50], 'Income':[10, 20]}) +new_X = MS.transform(new_data) +M_ols.get_prediction(new_X).predicted_mean ``` -## Special encodings - -For flexible models, we may want to consider transformations of features, i.e. polynomial -or spline transformations. Given transforms that follow the `fit/transform` paradigm -we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms` -package includes a `Poly` transform - -```{code-cell} ipython3 -from ISLP.models.model_spec import poly -poly('Income', 3) -``` +## Using `np.ndarray` -```{code-cell} ipython3 -design = ModelSpec([poly('Income', 3), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` +As the basic model is to concatenate columns extracted from a columnar data +representation, one *can* use `np.ndarray` as the column data. In this case, +columns will be selected by integer indices. -Compare: +### Caveats using `np.ndarray` -```{code-cell} ipython3 -%%R -lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef -``` +If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns. +However, +unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues. -## Splines +However, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so, +in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. -Support for natural and B-splines is also included +We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new +values of `Price` and `Location`. We first find the predicitions using `pd.DataFrame` and then illustrate the difficulties +in using `np.ndarray`. 
-```{code-cell} ipython3 -from ISLP.models.model_spec import ns, bs, pca -design = ModelSpec([ns('Income', df=5), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` ++++ -```{code-cell} ipython3 -%%R -library(splines) -lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef -``` +We will refit this model, using `ModelSpec` with an `np.ndarray` instead ```{code-cell} ipython3 -design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params +Carseats_np = np.asarray(Carseats[['Price', 'Education', 'Income']]) +MS_np = ModelSpec([0,2]).fit(Carseats_np) +MS_np.transform(Carseats_np) ``` ```{code-cell} ipython3 -%%R -lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef +M_ols_np = sm.OLS(Y, MS_np.transform(Carseats_np)).fit() +M_ols_np.params ``` -## PCA +Now, let's consider finding the design matrix at new points. +When using `pd.DataFrame` we only need to supply the `transform` method +a data frame with columns implicated in the `terms` argument (in this case, `Price` and `Income`). -```{code-cell} ipython3 -design = ModelSpec([pca(['Income', - 'Price', - 'Advertising', - 'Population'], - n_components=2, - name='myvars'), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` +However, when using `np.ndarray` with integers as indices, `Price` was column 0 and `Income` was column 2. The only +sensible way to produce a return for predict is to extract its 0th and 2nd columns. Note this means +that the meaning of columns in an `np.ndarray` provided to `transform` essentially must be identical to those +passed to `fit`. ```{code-cell} ipython3 -%%R -lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats) +try: + new_D = np.array([[40,50], [10,20]]).T + new_X = MS_np.transform(new_D) +except IndexError as e: + print(e) ``` -It is of course common to scale before running PCA. 
+Ultimately, `M` expects 3 columns for new predictions because it was fit +with a matrix having 3 columns (the first representing an intercept). -```{code-cell} ipython3 -design = ModelSpec([pca(['Income', - 'Price', - 'Advertising', - 'Population'], - n_components=2, - name='myvars', - scale=True), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` +We might be tempted to try as with the `pd.DataFrame` and produce +an `np.ndarray` with only the necessary variables. ```{code-cell} ipython3 -%%R -lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats) +new_D = np.array([[40,50], [np.nan, np.nan], [10,20]]).T +new_X = MS_np.transform(new_D) +print(new_X) +M_ols.get_prediction(new_X).predicted_mean ``` -There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead -of `np.std(ddof=1)`. - -```{code-cell} ipython3 -np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1)) -``` +For more complicated design contructions ensuring the columns of `new_D` match that of the original data will be more cumbersome. We expect +then that `pd.DataFrame` (or a columnar data representation with similar API) will likely be easier to use with `ModelSpec`. 
diff --git a/docs/jupyterbook/models/submodels.ipynb b/docs/jupyterbook/models/submodels.ipynb deleted file mode 100644 index 777037a..0000000 --- a/docs/jupyterbook/models/submodels.ipynb +++ /dev/null @@ -1,3127 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ee33d364", - "metadata": {}, - "source": [ - "# Building design matrices with `ModelSpec`\n", - "\n", - "Force rebuild" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "4c70fbaa", - "metadata": {}, - "outputs": [], - "source": [ - "x=4\n", - "import numpy as np, pandas as pd\n", - "%load_ext rpy2.ipython\n", - "\n", - "from ISLP import load_data\n", - "from ISLP.models import ModelSpec\n", - "\n", - "import statsmodels.api as sm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8a708215", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n", - " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n", - " dtype='object')" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats = load_data('Carseats')\n", - "%R -i Carseats\n", - "Carseats.columns" - ] - }, - { - "cell_type": "markdown", - "id": "dad5e991", - "metadata": {}, - "source": [ - "## Let's break up income into groups" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ac7086a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: OIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L' < 'M' < 'H']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'])\n", - "Carseats['OIncome']" - ] - }, - { - 
"cell_type": "markdown", - "id": "261446c8", - "metadata": {}, - "source": [ - "Let's also create an unordered version" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "674bb806", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: UIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L', 'M', 'H']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'],\n", - " ordered=False)\n", - "Carseats['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "8f030039", - "metadata": {}, - "source": [ - "## A simple model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "40cd6c28", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Price', 'Income'], dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Price', 'Income'])\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e65f5607", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Y = Carseats['Sales']\n", - "M = sm.OLS(Y, X).fit()\n", - "M.params" - ] - }, - { - "cell_type": "markdown", - "id": "29d9b55f", - "metadata": {}, - "source": [ - "## Basic procedure\n", - "\n", - "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n", - "A `pd.DataFrame` is essentially a list of columns. 
One of the first tasks done in `ModelSpec.fit`\n", - "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "cfbe5b92", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Bad\n", - "1 Good\n", - "2 Medium\n", - "3 Medium\n", - "4 Bad\n", - " ... \n", - "395 Good\n", - "396 Medium\n", - "397 Medium\n", - "398 Bad\n", - "399 Good\n", - "Name: ShelveLoc, Length: 400, dtype: category\n", - "Categories (3, object): ['Bad', 'Good', 'Medium']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "7092f666", - "metadata": {}, - "source": [ - "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n", - "`get_columns` and `fit_encoder`." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e2d43844", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "46a01612", - "metadata": {}, - "source": [ - "It recognized ordinal columns as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "465a9326", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['OIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "76f8480d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([ 73, 48, 35, 100]), ('Income',))" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "income = design.column_info_['Income']\n", - "cols, names = income.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "markdown", - "id": "25fcc1de", - "metadata": {}, - "source": [ - "## Encoding a column\n", - "\n", - "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n", - "variables usually are encoded by several columns, typically one less than the number of categories.\n", - "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n", - "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n", - "its encoder the first time data is passed to it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "dfe6cc35", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([[0., 0.],\n", - " [1., 0.],\n", - " [0., 1.],\n", - " [0., 1.]]),\n", - " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shelve = design.column_info_['ShelveLoc']\n", - "cols, names = shelve.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8fc9779a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[2.],\n", - " [1.],\n", - " [1.],\n", - " [0.]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "oincome = design.column_info_['OIncome']\n", - "oincome.get_columns(Carseats)[0][:4]" - ] - }, - { - "cell_type": "markdown", - "id": "8e04da60", - "metadata": {}, - "source": [ - "## The terms\n", - "\n", - "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n", - "the `terms` argument which should be a sequence. The elements of `terms` are often\n", - "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n", - "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c579dbce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Price', 'Income']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "4587b8bd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "2595f0fa", - "metadata": {}, - "source": [ - "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n", - "`Variable` can also create columns through the `build_columns` method of `ModelSpec`" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "03bd9366", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price\n", - " 0 120\n", - " 1 83\n", - " 2 80\n", - " 3 97\n", - " 4 128\n", - " .. ...\n", - " 395 128\n", - " 396 120\n", - " 397 159\n", - " 398 95\n", - " 399 120\n", - " \n", - " [400 rows x 1 columns],\n", - " ['Price'])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "price = design.terms_[0]\n", - "design.build_columns(Carseats, price)" - ] - }, - { - "cell_type": "markdown", - "id": "de04ca48", - "metadata": {}, - "source": [ - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. 
The\n", - "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", - "the call to `ModelSpec.fit`." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a42af4c5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price Income UIncome[L] UIncome[M]\n", - " 0 120.0 73.0 0.0 1.0\n", - " 1 83.0 48.0 1.0 0.0\n", - " 2 80.0 35.0 1.0 0.0\n", - " 3 97.0 100.0 0.0 0.0\n", - " 4 128.0 64.0 0.0 1.0\n", - " .. ... ... ... ...\n", - " 395 128.0 108.0 0.0 0.0\n", - " 396 120.0 23.0 1.0 0.0\n", - " 397 159.0 26.0 1.0 0.0\n", - " 398 95.0 79.0 0.0 1.0\n", - " 399 120.0 37.0 1.0 0.0\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n", - "design.build_columns(Carseats, new_var)" - ] - }, - { - "cell_type": "markdown", - "id": "b146d0c0", - "metadata": {}, - "source": [ - "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", - "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b6c394a6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( mynewvar[0] mynewvar[1]\n", - " 0 -3.608693 -4.853177\n", - " 1 15.081506 35.708630\n", - " 2 27.422871 40.774250\n", - " 3 -33.973209 13.470489\n", - " 4 6.567316 -11.290100\n", - " .. ... ...\n", - " 395 -36.846346 -18.415783\n", - " 396 45.741500 3.245602\n", - " 397 49.097533 -35.725355\n", - " 398 -13.577772 18.845139\n", - " 399 31.927566 0.978436\n", - " \n", - " [400 rows x 2 columns],\n", - " ['mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=2)\n", - "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n", - "design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "markdown", - "id": "3bb30a3f", - "metadata": {}, - "source": [ - "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "ea7770ff", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( Price Price mynewvar[0] mynewvar[1]\n", - " 0 120.0 120.0 -3.608693 -4.853177\n", - " 1 83.0 83.0 15.081506 35.708630\n", - " 2 80.0 80.0 27.422871 40.774250\n", - " 3 97.0 97.0 -33.973209 13.470489\n", - " 4 128.0 128.0 6.567316 -11.290100\n", - " .. ... ... ... ...\n", - " 395 128.0 128.0 -36.846346 -18.415783\n", - " 396 120.0 120.0 45.741500 3.245602\n", - " 397 159.0 159.0 49.097533 -35.725355\n", - " 398 95.0 95.0 -13.577772 18.845139\n", - " 399 120.0 120.0 31.927566 0.978436\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n", - "design.build_columns(Carseats, fancy_var)" - ] - }, - { - "cell_type": "markdown", - "id": "b2b4a01a", - "metadata": {}, - "source": [ - "We can of course run PCA again on these features (if we wanted)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "21ad8b44", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( fancy_pca[0] fancy_pca[1]\n", - " 0 -6.951792 4.859283\n", - " 1 55.170148 -24.694875\n", - " 2 59.418556 -38.033572\n", - " 3 34.722389 28.922184\n", - " 4 -21.419184 -3.120673\n", - " .. ... 
...\n", - " 395 -18.257348 40.760122\n", - " 396 -10.546709 -45.021658\n", - " 397 -77.706359 -37.174379\n", - " 398 36.668694 7.730851\n", - " 399 -9.540535 -31.059122\n", - " \n", - " [400 rows x 2 columns],\n", - " ['fancy_pca[0]', 'fancy_pca[1]'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca2 = PCA(n_components=2)\n", - "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n", - "design.build_columns(Carseats, pca2_var)" - ] - }, - { - "cell_type": "markdown", - "id": "2262377d", - "metadata": {}, - "source": [ - "## Building the design matrix\n", - "\n", - "With these notions in mind, the final design is essentially then" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "1654ca47", - "metadata": {}, - "outputs": [], - "source": [ - "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "1db0e0a9", - "metadata": {}, - "source": [ - "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n", - "a dataframe the index is adjusted accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "d20e8ea8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.intercept" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "450fe910", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptPriceIncome
01.012073
11.08348
21.08035
31.097100
\n", - "
" - ], - "text/plain": [ - " intercept Price Income\n", - "0 1.0 120 73\n", - "1 1.0 83 48\n", - "2 1.0 80 35\n", - "3 1.0 97 100" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.transform(Carseats)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "0705ba6f", - "metadata": {}, - "source": [ - "## Predicting\n", - "\n", - "Constructing the design matrix at any values is carried out by the `transform` method." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "866c2863", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n", - "new_X = design.transform(new_data)\n", - "M.get_prediction(new_X).predicted_mean" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "f2021166", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 \n", - "12.65258 12.25873 \n" - ] - } - ], - "source": [ - "%%R -i new_data,Carseats\n", - "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "20e1a31a", - "metadata": {}, - "source": [ - "### Difference between using `pd.DataFrame` and `np.ndarray`\n", - "\n", - "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "\n", - "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "a5926ec9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1.0, 120, 73],\n", - " [1.0, 83, 48],\n", - " [1.0, 80, 35],\n", - " [1.0, 97, 100]], dtype=object)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n", - "design_np = ModelSpec([0,3]).fit(Carseats_np)\n", - "design_np.transform(Carseats_np)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "997a63cb", - "metadata": {}, - "source": [ - "The following will fail for hopefully obvious reasons" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "40410c48", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n" - ] - } - ], - "source": [ - "try:\n", - " new_D = np.zeros((2,2))\n", - " new_D[:,0] = [10,20]\n", - " new_D[:,1] = [40,50]\n", - " M.get_prediction(new_D).predicted_mean\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "920203e9", - "metadata": {}, - "source": [ - "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", - "with a matrix having 3 columns (the first representing an intercept).\n", - "\n", - "We might be tempted to try as with the `pd.DataFrame` and produce\n", - "an `np.ndarray` with only the necessary variables." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "1061da77", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "index 3 is out of bounds for axis 1 with size 2\n" - ] - } - ], - "source": [ - "try:\n", - " new_X = np.zeros((2,2))\n", - " new_X[:,0] = [10,20]\n", - " new_X[:,1] = [40,50]\n", - " new_D = design_np.transform(new_X)\n", - " M.get_prediction(new_D).predicted_mean\n", - "except IndexError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "c6bfe001", - "metadata": {}, - "source": [ - "This fails because `design_np` is looking for column `3` from its `terms`:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "5ae6d25f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design_np.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "edd7ebeb", - "metadata": {}, - "source": [ - "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n", - "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "9455e532", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_X = np.zeros((2,4))\n", - "new_X[:,0] = [10,20]\n", - "new_X[:,3] = [40,50]\n", - "new_D = design_np.transform(new_X)\n", - "M.get_prediction(new_D).predicted_mean" - ] 
- }, - { - "cell_type": "markdown", - "id": "fd726791", - "metadata": {}, - "source": [ - "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n", - "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case." - ] - }, - { - "cell_type": "markdown", - "id": "967d9ebc", - "metadata": {}, - "source": [ - "## A model with some categorical variables\n", - "\n", - "Categorical variables become `Column` instances with encoders." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d0429b56", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n", - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "415e3fd0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "8a99c3a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 11.876012\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[L] -1.042297\n", - "UIncome[M] -0.119123\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - 
] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "9250a28a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "fe90c12c", - "metadata": {}, - "source": [ - "## Getting the encoding you want\n", - "\n", - "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n", - "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n", - "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n", - "where `new_encoder` would somehow drop the column we want dropped. \n", - "\n", - "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "0546ec84", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "6ec4fe65", - "metadata": {}, - "source": [ - "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n", - "we may want to encode `UIncome` differently within the same model. 
In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n", - "and we need a way to allow different encodings of the same column of `Carseats`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "61e7f56e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "802ed854", - "metadata": {}, - "source": [ - " We can create a new \n", - "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "82d7a01d", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import contrast\n", - "pref_encoding = contrast('UIncome', 'drop', 'L')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "e26849a1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[M] UIncome[H]\n", - " 0 1.0 0.0\n", - " 1 0.0 0.0\n", - " 2 0.0 0.0\n", - " 3 0.0 1.0\n", - " 4 1.0 0.0\n", - " .. ... 
...\n", - " 395 0.0 1.0\n", - " 396 0.0 0.0\n", - " 397 0.0 0.0\n", - " 398 1.0 0.0\n", - " 399 0.0 0.0\n", - " \n", - " [400 rows x 2 columns],\n", - " ['UIncome[M]', 'UIncome[H]'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, pref_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "2fc4cd8c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "49e33d41", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 10.833715\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[M] 0.923174\n", - "UIncome[H] 1.042297\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "ce018fdf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "2d42b822", - "metadata": {}, - "source": [ - "## Interactions\n", - 
"\n", - "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n", - "to `ModelSpec`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "fbb3e3ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 7.866634\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "UIncome[L] -2.734895\n", - "UIncome[M] -2.619745\n", - "dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "f9a7d4ad", - "metadata": {}, - "source": [ - "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n", - "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "5a6f8e69", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n", - " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n", - " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_[0]" - ] - }, - { - "cell_type": "markdown", - "id": "98eef5c8", - "metadata": {}, - "source": [ - "Comparing this to the previous `R` model." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "58c99601", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "9c979d7e", - "metadata": {}, - "source": [ - "We note a few important things:\n", - "\n", - "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n", - "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n", - "\n", - "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. 
For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n", - "\n", - "A few notes:\n", - "\n", - "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n", - "\n", - "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "0cb3b63a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[H] UIncome[L] UIncome[M]\n", - " 0 0.0 0.0 1.0\n", - " 1 0.0 1.0 0.0\n", - " 2 0.0 1.0 0.0\n", - " 3 1.0 0.0 0.0\n", - " 4 0.0 0.0 1.0\n", - " .. ... ... 
...\n", - " 395 1.0 0.0 0.0\n", - " 396 0.0 1.0 0.0\n", - " 397 0.0 1.0 0.0\n", - " 398 0.0 0.0 1.0\n", - " 399 0.0 1.0 0.0\n", - " \n", - " [400 rows x 3 columns],\n", - " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_encoding = contrast('UIncome', None)\n", - "design.build_columns(Carseats, full_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "272098d7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.131739\n", - "UIncome[M] 0.115150\n", - "UIncome[H] 1.156118\n", - "UIncome[H]:ShelveLoc[Good] 3.738052\n", - "UIncome[H]:ShelveLoc[Medium] 1.514104\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "dtype: float64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "fe05c471", - "metadata": {}, - "source": [ - "## Special encodings\n", - "\n", - "For flexible models, we may want to consider transformations of features, i.e. polynomial\n", - "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n", - "we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms`\n", - "package includes a `Poly` transform" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "67062299", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import poly\n", - "poly('Income', 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "df5e5b4d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.440077\n", - "poly(Income, 3, )[0] 10.036373\n", - "poly(Income, 3, )[1] -2.799156\n", - "poly(Income, 3, )[2] 2.399601\n", - "ShelveLoc[Good] 4.808133\n", - "ShelveLoc[Medium] 1.889533\n", - "dtype: float64" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "01be9c13", - "metadata": {}, - "source": [ - "Compare:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "3244d6f6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n", - " 5.440077 10.036373 -2.799156 2.399601 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.808133 1.889533 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "8ad5bb1d", - "metadata": {}, - "source": [ - "## Splines\n", - "\n", - "Support for natural and B-splines is also included" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "6a6f4358", - "metadata": 
{}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 4.240421\n", - "ns(Income, , df=5)[0] 1.468196\n", - "ns(Income, , df=5)[1] 1.499471\n", - "ns(Income, , df=5)[2] 1.152070\n", - "ns(Income, , df=5)[3] 2.418398\n", - "ns(Income, , df=5)[4] 1.804460\n", - "ShelveLoc[Good] 4.810449\n", - "ShelveLoc[Medium] 1.881095\n", - "dtype: float64" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import ns, bs, pca\n", - "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "fb740953", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n", - " 4.240421 1.468196 1.499471 1.152070 \n", - "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n", - " 2.418398 1.804460 4.810449 1.881095 \n" - ] - } - ], - "source": [ - "%%R\n", - "library(splines)\n", - "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "fe1bf7fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 3.495085\n", - "bs(Income, , df=7, degree=2)[0] 1.813118\n", - "bs(Income, , df=7, degree=2)[1] 0.961852\n", - "bs(Income, , df=7, degree=2)[2] 2.471545\n", - "bs(Income, , df=7, degree=2)[3] 2.158891\n", - "bs(Income, , df=7, degree=2)[4] 2.091625\n", - "bs(Income, , df=7, degree=2)[5] 2.600669\n", - "bs(Income, , df=7, degree=2)[6] 2.843108\n", - "ShelveLoc[Good] 4.804919\n", - "ShelveLoc[Medium] 1.880337\n", - "dtype: float64" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n", - "X = 
design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "86e966e0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) bs(Income, df = 7, degree = 2)1 \n", - " 3.4950851 1.8131176 \n", - "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n", - " 0.9618523 2.4715450 \n", - "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n", - " 2.1588908 2.0916252 \n", - "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n", - " 2.6006694 2.8431084 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.8049190 1.8803375 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "877d4784", - "metadata": {}, - "source": [ - "## PCA" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "8ba6cb20", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.419405\n", - "pca(myvars, , n_components=2)[0] -0.001131\n", - "pca(myvars, , n_components=2)[1] -0.024217\n", - "ShelveLoc[Good] 4.816253\n", - "ShelveLoc[Medium] 1.924139\n", - "dtype: float64" - ] - }, - 
"execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars'), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "f0319e51", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.419405 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n", - " 0.001131 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n", - " -0.024217 \n", - " ShelveLocGood \n", - " 4.816253 \n", - " ShelveLocMedium \n", - " 1.924139 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "1f55086a", - "metadata": {}, - "source": [ - "It is of course common to scale before running PCA." 
- ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "bbe9e004", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.352159\n", - "pca(myvars, , n_components=2)[0] 0.446383\n", - "pca(myvars, , n_components=2)[1] -1.219788\n", - "ShelveLoc[Good] 4.922780\n", - "ShelveLoc[Medium] 2.005617\n", - "dtype: float64" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars',\n", - " scale=True), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "d78c02e4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.3522 \n", - "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n", - " 0.4469 \n", - 
"prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n", - " -1.2213 \n", - " ShelveLocGood \n", - " 4.9228 \n", - " ShelveLocMedium \n", - " 2.0056 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "8a03c603", - "metadata": {}, - "source": [ - "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n", - "of `np.std(ddof=1)`." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "f8215cef", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0.44694166, -1.22131519])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))" - ] - }, - { - "cell_type": "markdown", - "id": "a15d0ead", - "metadata": {}, - "source": [ - "## Submodels\n", - "\n", - "We can build submodels as well, even if the terms do not appear in the original `terms` argument.\n", - "Fundamentally, the terms just need to be able to have the `design.build_columns` work for us to be\n", - "able to build a design matrix. The initial inspection of the columns of `Carseats` has created\n", - "a column for `US`, hence we can build this submodel." - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "d58c6244", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptUS[Yes]
01.01.0
11.01.0
21.01.0
31.01.0
41.00.0
.........
3951.01.0
3961.01.0
3971.01.0
3981.01.0
3991.01.0
\n", - "

400 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " intercept US[Yes]\n", - "0 1.0 1.0\n", - "1 1.0 1.0\n", - "2 1.0 1.0\n", - "3 1.0 1.0\n", - "4 1.0 0.0\n", - ".. ... ...\n", - "395 1.0 1.0\n", - "396 1.0 1.0\n", - "397 1.0 1.0\n", - "398 1.0 1.0\n", - "399 1.0 1.0\n", - "\n", - "[400 rows x 2 columns]" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['UIncome', 'ShelveLoc', 'Price']).fit(Carseats)\n", - "design.build_submodel(Carseats, ['US'])" - ] - }, - { - "cell_type": "markdown", - "id": "9365ba27", - "metadata": {}, - "source": [ - "## ANOVA \n", - "\n", - "For a given `terms` argument, there as a natural sequence of models, namely those specified by `[terms[:i] for i in range(len(terms)+1]`." - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "332ab454", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['intercept'], dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]'], dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price'], dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n", - " 'UIncome[L]', 'UIncome[M]'],\n", - " dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n", - " 'UIncome[L]', 'UIncome[M]', 'US[Yes]'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "design = ModelSpec(['ShelveLoc', 'Price', 'UIncome', 'US']).fit(Carseats)\n", - "for D in design.build_sequence(Carseats):\n", - " print(D.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "f6cfd031", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1397.02172.7435552.01009.531143153.0108585.452815e-50
2396.01455.6407021.0717.102853217.3771921.583751e-39
3394.01378.9159382.076.72476411.6288851.239031e-05
4393.01296.4627001.082.45323824.9942578.678832e-07
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 397.0 2172.743555 2.0 1009.531143 153.010858 5.452815e-50\n", - "2 396.0 1455.640702 1.0 717.102853 217.377192 1.583751e-39\n", - "3 394.0 1378.915938 2.0 76.724764 11.628885 1.239031e-05\n", - "4 393.0 1296.462700 1.0 82.453238 24.994257 8.678832e-07" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "11c4aee8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "ShelveLoc 2 1009.53 504.77 153.011 < 2.2e-16 ***\n", - "Price 1 717.10 717.10 217.377 < 2.2e-16 ***\n", - "UIncome 2 76.72 38.36 11.629 1.240e-05 ***\n", - "US 1 82.45 82.45 24.994 8.679e-07 ***\n", - "Residuals 393 1296.46 3.30 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ ShelveLoc + Price + UIncome + US, data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "9a4e6e63", - "metadata": {}, - "source": [ - "Recall that `ModelSpec` does not inspect `terms` to reorder based on degree of \n", - "interaction as `R` does:" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "6e7bf361", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1393.02059.3764136.01122.89828435.9400471.175738e-34
2391.02036.0445962.023.3318172.2403101.077900e-01
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 393.0 2059.376413 6.0 1122.898284 35.940047 1.175738e-34\n", - "2 391.0 2036.044596 2.0 23.331817 2.240310 1.077900e-01" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([(full_encoding, 'ShelveLoc'), pref_encoding]).fit(Carseats)\n", - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "ed7d4bfa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "UIncome 2 61.92 30.962 5.9458 0.002859 ** \n", - "UIncome:ShelveLoc 6 1084.31 180.718 34.7049 < 2.2e-16 ***\n", - "Residuals 391 2036.04 5.207 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "0350da34", - "metadata": {}, - "source": [ - "To agree with `R` we must order `terms` as `R` will." - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "5ddaf87c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1397.03120.3513822.061.9233165.9458462.855424e-03
2391.02036.0445966.01084.30678534.7048681.346561e-33
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 397.0 3120.351382 2.0 61.923316 5.945846 2.855424e-03\n", - "2 391.0 2036.044596 6.0 1084.306785 34.704868 1.346561e-33" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]).fit(Carseats)\n", - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))" - ] - }, - { - "cell_type": "markdown", - "id": "1ef70ce3", - "metadata": {}, - "source": [ - "## More complicated interactions\n", - "\n", - "Can we have an interaction of a polynomial effect with a categorical? Absolutely" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "a1a14742", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "UIncome 2 61.92 30.9617 4.0310 0.01851 *\n", - "UIncome:poly(Income, 3) 9 79.72 8.8581 1.1533 0.32408 \n", - "UIncome:US 3 83.51 27.8367 3.6242 0.01324 *\n", - "Residuals 385 2957.12 7.6808 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ UIncome + poly(Income, 3):UIncome + UIncome:US, data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "a909be1a", - "metadata": {}, - "source": [ - "To match `R` we note that it has used its inspection rules to encode `UIncome` with 3 levels\n", - "for the two interactions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "ae286cf3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 65.978856\n", - "UIncome[M] -60.159607\n", - "UIncome[H] -147.276154\n", - "poly(Income, 3, )[0]:UIncome[H] 1957.694387\n", - "poly(Income, 3, )[0]:UIncome[L] 1462.060650\n", - "poly(Income, 3, )[0]:UIncome[M] 83.035153\n", - "poly(Income, 3, )[1]:UIncome[H] -984.494570\n", - "poly(Income, 3, )[1]:UIncome[L] 881.537647\n", - "poly(Income, 3, )[1]:UIncome[M] -18.006234\n", - "poly(Income, 3, )[2]:UIncome[H] 207.614692\n", - "poly(Income, 3, )[2]:UIncome[L] 217.190749\n", - "poly(Income, 3, )[2]:UIncome[M] 34.065434\n", - "UIncome[H]:US 0.903404\n", - "UIncome[L]:US 0.895538\n", - "UIncome[M]:US 1.048728\n", - "dtype: float64" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p3 = poly('Income', 3)\n", - "design = ModelSpec([pref_encoding, (p3, full_encoding), (full_encoding, 'US')]).fit(Carseats)\n", - "X = design.transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "236ab2d2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1397.03120.3513822.061.9233164.0310320.018488
2388.03040.6285599.079.7228231.1532730.324049
3385.02957.1184443.083.5101153.6241810.013244
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 397.0 3120.351382 2.0 61.923316 4.031032 0.018488\n", - "2 388.0 3040.628559 9.0 79.722823 1.153273 0.324049\n", - "3 385.0 2957.118444 3.0 83.510115 3.624181 0.013244" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))" - ] - }, - { - "cell_type": "markdown", - "id": "0a45c720", - "metadata": {}, - "source": [ - "## Grouping columns for ANOVA\n", - "\n", - "The `Variable` construct can be used to group\n", - "variables together to get custom sequences of models for `anova_lm`." - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "f36c1b3b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['intercept'], dtype='object')\n", - "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]'], dtype='object')\n", - "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]', 'US[Yes]',\n", - " 'Advertising'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "group1 = Variable(('Price', pref_encoding), 'group1', None)\n", - "group2 = Variable(('US', 'Advertising'), 'group2', None)\n", - "design = ModelSpec([group1, group2]).fit(Carseats)\n", - "for D in design.build_sequence(Carseats):\n", - " print(D.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "3daf7638", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1396.02508.1877883.0674.08691039.3048412.970412e-22
2394.02252.3963432.0255.79144522.3721356.267562e-10
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 396.0 2508.187788 3.0 674.086910 39.304841 2.970412e-22\n", - "2 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))" - ] - }, - { - "cell_type": "markdown", - "id": "46c1ace8", - "metadata": {}, - "source": [ - "It is not clear this is simple to do in `R` as the formula object expands all parentheses." - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "0b87e430", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "Price 1 630.03 630.03 110.2079 < 2.2e-16 ***\n", - "UIncome 2 44.06 22.03 3.8533 0.02201 * \n", - "US 1 121.88 121.88 21.3196 5.270e-06 ***\n", - "Advertising 1 133.91 133.91 23.4247 1.868e-06 ***\n", - "Residuals 394 2252.40 5.72 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ (Price + UIncome) + (US + Advertising), data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "7c137360", - "metadata": {}, - "source": [ - "It can be done by building up the models\n", - "by hand and likely is possible to be done programmatically but it seems not obvious." 
- ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "b678d323", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ 1\n", - "Model 2: Sales ~ Price + UIncome\n", - "Model 3: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F) \n", - "1 399 3182.3 \n", - "2 396 2508.2 3 674.09 39.305 < 2.2e-16 ***\n", - "3 394 2252.4 2 255.79 22.372 6.268e-10 ***\n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "M1 = lm(Sales ~ 1, data=Carseats)\n", - "M2 = lm(Sales ~ Price + UIncome, data=Carseats)\n", - "M3 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n", - "anova(M1, M2, M3)" - ] - }, - { - "cell_type": "markdown", - "id": "b0388949", - "metadata": {}, - "source": [ - "## Alternative anova\n", - "\n", - "Another common ANOVA table involves dropping each term in succession from the model and comparing\n", - "to the full model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "ac5b916a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'intercept'}\n", - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 395.0 4417.273517 0.0 NaN NaN NaN\n", - "1 394.0 2252.396343 1.0 2164.877175 378.690726 1.359177e-59\n", - "{'Price', 'UIncome[H]', 'UIncome[M]'}\n", - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 397.0 2950.808154 0.0 NaN NaN NaN\n", - "1 394.0 2252.396343 3.0 698.411811 40.723184 6.077848e-23\n", - "{'US[Yes]', 'Advertising'}\n", - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 396.0 2508.187788 0.0 NaN NaN NaN\n", - "1 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10\n" - ] - } - ], - "source": [ - "Dfull = design.transform(Carseats)\n", - "Mfull = sm.OLS(Y, Dfull).fit()\n", - "for i, D in enumerate(design.build_sequence(Carseats, anova_type='drop')):\n", - " if i == 0:\n", - " D0 = D\n", - " print(set(D.columns) ^ set(Dfull.columns))\n", - " print(sm.stats.anova_lm(sm.OLS(Y, D).fit(), Mfull))" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "a0c71948", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ US + Advertising\n", - "Model 2: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F) \n", - "1 397 2950.8 \n", - "2 394 2252.4 3 698.41 40.723 < 2.2e-16 ***\n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ Price + UIncome\n", - "Model 2: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F) \n", - "1 396 2508.2 \n", - "2 394 2252.4 2 255.79 22.372 6.268e-10 ***\n", - "---\n", - "Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n", - "M2 = lm(Sales ~ US + Advertising, data=Carseats)\n", - "print(anova(M2, M1))\n", - "M3 = lm(Sales ~ Price + UIncome, data=Carseats)\n", - "print(anova(M3, M1))" - ] - }, - { - "cell_type": "markdown", - "id": "a5e4880d", - "metadata": {}, - "source": [ - "The comparison without the intercept here is actually very hard to achieve in `R` with `anova` due to its inspection\n", - "of the formula." - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "id": "4b383401", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ Price + UIncome + US + Advertising - 1\n", - "Model 2: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F)\n", - "1 394 2252.4 \n", - "2 394 2252.4 0 9.0949e-13 \n" - ] - } - ], - "source": [ - "%%R\n", - "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n", - "M4 = lm(Sales ~ Price + UIncome + US + Advertising - 1, data=Carseats)\n", - "print(anova(M4, M1))" - ] - }, - { - "cell_type": "markdown", - "id": "72d7c83b", - "metadata": {}, - "source": [ - "It can be found with `summary`." - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "4d5ce789", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ Price + UIncome + US + Advertising, data = Carseats)\n", - "\n", - "Residuals:\n", - " Min 1Q Median 3Q Max \n", - "-7.4437 -1.6351 -0.0932 1.4920 6.8076 \n", - "\n", - "Coefficients:\n", - " Estimate Std. Error t value Pr(>|t|) \n", - "(Intercept) 12.520356 0.643390 19.460 < 2e-16 ***\n", - "Price -0.054000 0.005072 -10.647 < 2e-16 ***\n", - "UIncomeM 0.548906 0.281693 1.949 0.0521 . 
\n", - "UIncomeH 0.708219 0.322028 2.199 0.0284 * \n", - "USYes 0.024181 0.343246 0.070 0.9439 \n", - "Advertising 0.119509 0.024692 4.840 1.87e-06 ***\n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", - "\n", - "Residual standard error: 2.391 on 394 degrees of freedom\n", - "Multiple R-squared: 0.2922,\tAdjusted R-squared: 0.2832 \n", - "F-statistic: 32.53 on 5 and 394 DF, p-value: < 2.2e-16\n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "summary(M1)" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "56b82d02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(378.690726, 378.69160000000005)" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "378.690726, 19.46**2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "872f645c-1d6f-4d08-9eec-2b80276bc82c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" - }, - "kernelspec": { - "display_name": "islp_test", - "language": "python", - "name": "islp_test" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/jupyterbook/models/submodels.md b/docs/jupyterbook/models/submodels.md deleted file mode 100644 index c2a97fd..0000000 --- a/docs/jupyterbook/models/submodels.md +++ /dev/null @@ -1,652 +0,0 @@ ---- -jupytext: - formats: source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.1 -kernelspec: - display_name: islp_test - language: 
python - name: islp_test ---- - -# Building design matrices with `ModelSpec` - -Force rebuild - -```{code-cell} ipython3 -x=4 -import numpy as np, pandas as pd -%load_ext rpy2.ipython - -from ISLP import load_data -from ISLP.models import ModelSpec - -import statsmodels.api as sm -``` - -```{code-cell} ipython3 -Carseats = load_data('Carseats') -%R -i Carseats -Carseats.columns -``` - -## Let's break up income into groups - -```{code-cell} ipython3 -Carseats['OIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H']) -Carseats['OIncome'] -``` - -Let's also create an unordered version - -```{code-cell} ipython3 -Carseats['UIncome'] = pd.cut(Carseats['Income'], - [0,50,90,200], - labels=['L','M','H'], - ordered=False) -Carseats['UIncome'] -``` - -## A simple model - -```{code-cell} ipython3 -design = ModelSpec(['Price', 'Income']) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -Y = Carseats['Sales'] -M = sm.OLS(Y, X).fit() -M.params -``` - -## Basic procedure - -The design matrix is built by cobbling together a set of columns and possibly transforming them. -A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit` -is to inspect a dataframe for column info. The column `ShelveLoc` is categorical: - -```{code-cell} ipython3 -Carseats['ShelveLoc'] -``` - -This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods -`get_columns` and `fit_encoder`. - -```{code-cell} ipython3 -design.column_info_['ShelveLoc'] -``` - -It recognized ordinal columns as well. - -```{code-cell} ipython3 -design.column_info_['OIncome'] -``` - -```{code-cell} ipython3 -income = design.column_info_['Income'] -cols, names = income.get_columns(Carseats) -(cols[:4], names) -``` - -## Encoding a column - -In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). 
Categorical -variables usually are encoded by several columns, typically one less than the number of categories. -This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform -model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits -its encoder the first time data is passed to it. - -```{code-cell} ipython3 -shelve = design.column_info_['ShelveLoc'] -cols, names = shelve.get_columns(Carseats) -(cols[:4], names) -``` - -```{code-cell} ipython3 -oincome = design.column_info_['OIncome'] -oincome.get_columns(Carseats)[0][:4] -``` - -## The terms - -The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through -the `terms` argument which should be a sequence. The elements of `terms` are often -going to be strings (or tuples of strings for interactions, see below) but are converted to a -`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple. - -```{code-cell} ipython3 -design.terms -``` - -```{code-cell} ipython3 -design.terms_ -``` - -While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A -`Variable` can also create columns through the `build_columns` method of `ModelSpec` - -```{code-cell} ipython3 -price = design.terms_[0] -design.build_columns(Carseats, price) -``` - -Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The -tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then -is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during -the call to `ModelSpec.fit`. 
- -```{code-cell} ipython3 -from ISLP.models.model_spec import Variable - -new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None) -design.build_columns(Carseats, new_var) -``` - -Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the -arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`. - -```{code-cell} ipython3 -from sklearn.decomposition import PCA -pca = PCA(n_components=2) -pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit` -pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca) -design.build_columns(Carseats, pca_var) -``` - -The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`) -or `Variable` instances (`pca_var`). - -```{code-cell} ipython3 -fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None) -design.build_columns(Carseats, fancy_var) -``` - -We can of course run PCA again on these features (if we wanted). - -```{code-cell} ipython3 -pca2 = PCA(n_components=2) -pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit` -pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2) -design.build_columns(Carseats, pca2_var) -``` - -## Building the design matrix - -With these notions in mind, the final design is essentially then - -```{code-cell} ipython3 -X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4] -``` - -An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is -a dataframe the index is adjusted accordingly. - -```{code-cell} ipython3 -design.intercept -``` - -```{code-cell} ipython3 -design.transform(Carseats)[:4] -``` - -## Predicting - -Constructing the design matrix at any values is carried out by the `transform` method. 
- -```{code-cell} ipython3 -new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]}) -new_X = design.transform(new_data) -M.get_prediction(new_X).predicted_mean -``` - -```{code-cell} ipython3 -%%R -i new_data,Carseats -predict(lm(Sales ~ Price + Income, data=Carseats), new_data) -``` - -### Difference between using `pd.DataFrame` and `np.ndarray` - -If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns. - -If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so, -in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. - -```{code-cell} ipython3 -Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']]) -design_np = ModelSpec([0,3]).fit(Carseats_np) -design_np.transform(Carseats_np)[:4] -``` - -The following will fail for hopefully obvious reasons - -```{code-cell} ipython3 -try: - new_D = np.zeros((2,2)) - new_D[:,0] = [10,20] - new_D[:,1] = [40,50] - M.get_prediction(new_D).predicted_mean -except ValueError as e: - print(e) -``` - -Ultimately, `M` expects 3 columns for new predictions because it was fit -with a matrix having 3 columns (the first representing an intercept). - -We might be tempted to try as with the `pd.DataFrame` and produce -an `np.ndarray` with only the necessary variables. 
- -```{code-cell} ipython3 -try: - new_X = np.zeros((2,2)) - new_X[:,0] = [10,20] - new_X[:,1] = [40,50] - new_D = design_np.transform(new_X) - M.get_prediction(new_D).predicted_mean -except IndexError as e: - print(e) -``` - -This fails because `design_np` is looking for column `3` from its `terms`: - -```{code-cell} ipython3 -design_np.terms_ -``` - -However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed -represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`: - -```{code-cell} ipython3 -new_X = np.zeros((2,4)) -new_X[:,0] = [10,20] -new_X[:,3] = [40,50] -new_D = design_np.transform(new_X) -M.get_prediction(new_D).predicted_mean -``` - -Given this subtlety about needing to supply arrays with identical column structure to `transform` when -using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case. - -+++ - -## A model with some categorical variables - -Categorical variables become `Column` instances with encoders. - -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats) -design.column_info_['UIncome'] -``` - -```{code-cell} ipython3 -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef -``` - -## Getting the encoding you want - -By default the level dropped by `ModelSpec` will be the first of the `categories_` values from -`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems -as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)` -where `new_encoder` would somehow drop the column we want dropped. 
- -However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`: - -```{code-cell} ipython3 -design.column_info_['UIncome'] -``` - -This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when -we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest -and we need a way to allow different encodings of the same column of `Carseats` - -```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) -``` - - We can create a new -`Column` with the encoder we want. For categorical variables, there is a convenience function to do so. - -```{code-cell} ipython3 -from ISLP.models.model_spec import contrast -pref_encoding = contrast('UIncome', 'drop', 'L') -``` - -```{code-cell} ipython3 -design.build_columns(Carseats, pref_encoding) -``` - -```{code-cell} ipython3 -design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats) -X = design.fit_transform(Carseats) -X.columns -``` - -```{code-cell} ipython3 -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef -``` - -## Interactions - -We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument -to `ModelSpec`. 
- -```{code-cell} ipython3 -design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with -`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`. - -```{code-cell} ipython3 -design.terms_[0] -``` - -Comparing this to the previous `R` model. - -```{code-cell} ipython3 -%%R -lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats) -``` - -We note a few important things: - -1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these -columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!** - -2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.** - -A few notes: - -- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.** - -- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily. 
- -```{code-cell} ipython3 -full_encoding = contrast('UIncome', None) -design.build_columns(Carseats, full_encoding) -``` - -```{code-cell} ipython3 -design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -## Special encodings - -For flexible models, we may want to consider transformations of features, i.e. polynomial -or spline transformations. Given transforms that follow the `fit/transform` paradigm -we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms` -package includes a `Poly` transform - -```{code-cell} ipython3 -from ISLP.models.model_spec import poly -poly('Income', 3) -``` - -```{code-cell} ipython3 -design = ModelSpec([poly('Income', 3), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -Compare: - -```{code-cell} ipython3 -%%R -lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef -``` - -## Splines - -Support for natural and B-splines is also included - -```{code-cell} ipython3 -from ISLP.models.model_spec import ns, bs, pca -design = ModelSpec([ns('Income', df=5), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -library(splines) -lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef -``` - -```{code-cell} ipython3 -design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef -``` - -## PCA - -```{code-cell} ipython3 -design = ModelSpec([pca(['Income', - 'Price', - 'Advertising', - 'Population'], - n_components=2, - name='myvars'), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats) -``` - -It is 
of course common to scale before running PCA. - -```{code-cell} ipython3 -design = ModelSpec([pca(['Income', - 'Price', - 'Advertising', - 'Population'], - n_components=2, - name='myvars', - scale=True), 'ShelveLoc']) -X = design.fit_transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -%%R -lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats) -``` - -There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead -of `np.std(ddof=1)`. - -```{code-cell} ipython3 -np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1)) -``` - -## Submodels - -We can build submodels as well, even if the terms do not appear in the original `terms` argument. -Fundamentally, the terms just need to be able to have the `design.build_columns` work for us to be -able to build a design matrix. The initial inspection of the columns of `Carseats` has created -a column for `US`, hence we can build this submodel. - -```{code-cell} ipython3 -design = ModelSpec(['UIncome', 'ShelveLoc', 'Price']).fit(Carseats) -design.build_submodel(Carseats, ['US']) -``` - -## ANOVA - -For a given `terms` argument, there as a natural sequence of models, namely those specified by `[terms[:i] for i in range(len(terms)+1]`. 
- -```{code-cell} ipython3 -design = ModelSpec(['ShelveLoc', 'Price', 'UIncome', 'US']).fit(Carseats) -for D in design.build_sequence(Carseats): - print(D.columns) -``` - -```{code-cell} ipython3 -sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) )) -``` - -```{code-cell} ipython3 -%%R -anova(lm(Sales ~ ShelveLoc + Price + UIncome + US, data=Carseats)) -``` - -Recall that `ModelSpec` does not inspect `terms` to reorder based on degree of -interaction as `R` does: - -```{code-cell} ipython3 -design = ModelSpec([(full_encoding, 'ShelveLoc'), pref_encoding]).fit(Carseats) -sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) )) -``` - -```{code-cell} ipython3 -%%R -anova(lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)) -``` - -To agree with `R` we must order `terms` as `R` will. - -```{code-cell} ipython3 -design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]).fit(Carseats) -sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats))) -``` - -## More complicated interactions - -Can we have an interaction of a polynomial effect with a categorical? Absolutely - -```{code-cell} ipython3 -%%R -anova(lm(Sales ~ UIncome + poly(Income, 3):UIncome + UIncome:US, data=Carseats)) -``` - -To match `R` we note that it has used its inspection rules to encode `UIncome` with 3 levels -for the two interactions. - -```{code-cell} ipython3 -p3 = poly('Income', 3) -design = ModelSpec([pref_encoding, (p3, full_encoding), (full_encoding, 'US')]).fit(Carseats) -X = design.transform(Carseats) -sm.OLS(Y, X).fit().params -``` - -```{code-cell} ipython3 -sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats))) -``` - -## Grouping columns for ANOVA - -The `Variable` construct can be used to group -variables together to get custom sequences of models for `anova_lm`. 
- -```{code-cell} ipython3 -group1 = Variable(('Price', pref_encoding), 'group1', None) -group2 = Variable(('US', 'Advertising'), 'group2', None) -design = ModelSpec([group1, group2]).fit(Carseats) -for D in design.build_sequence(Carseats): - print(D.columns) -``` - -```{code-cell} ipython3 -sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats))) -``` - -It is not clear this is simple to do in `R` as the formula object expands all parentheses. - -```{code-cell} ipython3 -%%R -anova(lm(Sales ~ (Price + UIncome) + (US + Advertising), data=Carseats)) -``` - -It can be done by building up the models -by hand and likely is possible to be done programmatically but it seems not obvious. - -```{code-cell} ipython3 -%%R -M1 = lm(Sales ~ 1, data=Carseats) -M2 = lm(Sales ~ Price + UIncome, data=Carseats) -M3 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats) -anova(M1, M2, M3) -``` - -## Alternative anova - -Another common ANOVA table involves dropping each term in succession from the model and comparing -to the full model. - -```{code-cell} ipython3 -Dfull = design.transform(Carseats) -Mfull = sm.OLS(Y, Dfull).fit() -for i, D in enumerate(design.build_sequence(Carseats, anova_type='drop')): - if i == 0: - D0 = D - print(set(D.columns) ^ set(Dfull.columns)) - print(sm.stats.anova_lm(sm.OLS(Y, D).fit(), Mfull)) -``` - -```{code-cell} ipython3 -%%R -M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats) -M2 = lm(Sales ~ US + Advertising, data=Carseats) -print(anova(M2, M1)) -M3 = lm(Sales ~ Price + UIncome, data=Carseats) -print(anova(M3, M1)) -``` - -The comparison without the intercept here is actually very hard to achieve in `R` with `anova` due to its inspection -of the formula. - -```{code-cell} ipython3 -%%R -M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats) -M4 = lm(Sales ~ Price + UIncome + US + Advertising - 1, data=Carseats) -print(anova(M4, M1)) -``` - -It can be found with `summary`. 
- -```{code-cell} ipython3 -%%R -summary(M1) -``` - -```{code-cell} ipython3 -378.690726, 19.46**2 -``` - -```{code-cell} ipython3 - -``` diff --git a/docs/jupyterbook/transforms/PCA.ipynb b/docs/jupyterbook/transforms/PCA.ipynb index d8b41f3..ec1e0ae 100644 --- a/docs/jupyterbook/transforms/PCA.ipynb +++ b/docs/jupyterbook/transforms/PCA.ipynb @@ -19,9 +19,14 @@ "outputs": [], "source": [ "import numpy as np\n", + "from sklearn.decomposition import PCA\n", + "\n", "from ISLP import load_data\n", - "from ISLP.models import ModelSpec, pca, Variable, derived_variable\n", - "from sklearn.decomposition import PCA" + "from ISLP.models import (ModelSpec, \n", + " pca, \n", + " Feature, \n", + " derived_feature,\n", + " build_columns)" ] }, { @@ -71,7 +76,7 @@ "id": "fff603bf", "metadata": {}, "source": [ - "Suppose we want to make a `Variable` representing the first 3 principal components of the\n", + "Suppose we want to make a `Feature` representing the first 3 principal components of the\n", " features `['CompPrice', 'Income', 'Advertising', 'Population', 'Price']`." ] }, @@ -80,8 +85,8 @@ "id": "eab49ad1-3957-478f-8a76-28a8f58551e9", "metadata": {}, "source": [ - "We first make a `Variable` that represents these five features columns, then `pca`\n", - "can be used to compute a new `Variable` that returns the first three principal components." + "We first make a `Feature` that represents these five features columns, then `pca`\n", + "can be used to compute a new `Feature` that returns the first three principal components." 
] }, { @@ -91,7 +96,7 @@ "metadata": {}, "outputs": [], "source": [ - "grouped = Variable(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n", + "grouped = Feature(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n", "sklearn_pca = PCA(n_components=3, whiten=True)" ] }, @@ -100,7 +105,7 @@ "id": "b45655a3-393d-4b4c-b754-cda61ed0e014", "metadata": {}, "source": [ - "We can now fit `sklearn_pca` and create our new variable." + "We can now fit `sklearn_pca` and create our new feature." ] }, { @@ -108,175 +113,18 @@ "execution_count": 5, "id": "6cfe8861-ad07-47b9-95d1-5d5513ff6fbe", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ - "sklearn_pca.fit(design.build_columns(Carseats, grouped)[0]) \n", - "pca_var = derived_variable(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n", + "grouped_features = build_columns(design.column_info_,\n", + " Carseats,\n", + " grouped)[0]\n", + "sklearn_pca.fit(grouped_features) \n", + "pca_var = derived_feature(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n", " name='pca(grouped)', encoder=sklearn_pca)\n", - "derived_features, _ = design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "aeb47184-9e15-4a6e-b60a-916f5ff89063", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CompPriceIncomeAdvertisingPopulationPrice
01387311276120
1111481626083
2113351026980
3117100446697
4141643340128
..................
39513810817203128
39613923337120
3971622612368159
39810079728495
39913437027120
\n", - "

400 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " CompPrice Income Advertising Population Price\n", - "0 138 73 11 276 120\n", - "1 111 48 16 260 83\n", - "2 113 35 10 269 80\n", - "3 117 100 4 466 97\n", - "4 141 64 3 340 128\n", - ".. ... ... ... ... ...\n", - "395 138 108 17 203 128\n", - "396 139 23 3 37 120\n", - "397 162 26 12 368 159\n", - "398 100 79 7 284 95\n", - "399 134 37 0 27 120\n", - "\n", - "[400 rows x 5 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, grouped)[0]" + "derived_features, _ = build_columns(design.column_info_,\n", + " Carseats, \n", + " pca_var,\n", + " encoders=design.encoders_)" ] }, { @@ -291,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "9f4b0955", "metadata": {}, "outputs": [], @@ -304,22 +152,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "6b382699-eb86-457f-8e91-09a63eb21d49", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/plain": [ @@ -329,7 +165,7 @@ " dtype='object')" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -350,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 
8, "id": "4a8d9b28", "metadata": {}, "outputs": [], @@ -361,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "6efa6c67-86e1-4f51-86c2-25c838a90bf4", "metadata": {}, "outputs": [ @@ -371,7 +207,7 @@ "(4.073428490498941e-14, 0.0)" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -386,9 +222,9 @@ "formats": "source/transforms///ipynb,jupyterbook/transforms///md:myst,jupyterbook/transforms///ipynb" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/docs/jupyterbook/transforms/PCA.md b/docs/jupyterbook/transforms/PCA.md index b9ba769..6b1a77f 100644 --- a/docs/jupyterbook/transforms/PCA.md +++ b/docs/jupyterbook/transforms/PCA.md @@ -5,11 +5,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Derived features: using PCA on a subset of columns @@ -19,9 +19,14 @@ construction of transformers applied to features. ```{code-cell} ipython3 import numpy as np -from ISLP import load_data -from ISLP.models import ModelSpec, pca, Variable, derived_variable from sklearn.decomposition import PCA + +from ISLP import load_data +from ISLP.models import (ModelSpec, + pca, + Feature, + derived_feature, + build_columns) ``` ```{code-cell} ipython3 @@ -35,30 +40,32 @@ Let's create a `ModelSpec` that is aware of all of the relevant columns. 
design = ModelSpec(Carseats.columns.drop(['Sales'])).fit(Carseats) ``` -Suppose we want to make a `Variable` representing the first 3 principal components of the +Suppose we want to make a `Feature` representing the first 3 principal components of the features `['CompPrice', 'Income', 'Advertising', 'Population', 'Price']`. +++ -We first make a `Variable` that represents these five features columns, then `pca` -can be used to compute a new `Variable` that returns the first three principal components. +We first make a `Feature` that represents these five features columns, then `pca` +can be used to compute a new `Feature` that returns the first three principal components. ```{code-cell} ipython3 -grouped = Variable(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None) +grouped = Feature(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None) sklearn_pca = PCA(n_components=3, whiten=True) ``` -We can now fit `sklearn_pca` and create our new variable. +We can now fit `sklearn_pca` and create our new feature. 
```{code-cell} ipython3 -sklearn_pca.fit(design.build_columns(Carseats, grouped)[0]) -pca_var = derived_variable(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'], +grouped_features = build_columns(design.column_info_, + Carseats, + grouped)[0] +sklearn_pca.fit(grouped_features) +pca_var = derived_feature(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'], name='pca(grouped)', encoder=sklearn_pca) -derived_features, _ = design.build_columns(Carseats, pca_var) -``` - -```{code-cell} ipython3 -design.build_columns(Carseats, grouped)[0] +derived_features, _ = build_columns(design.column_info_, + Carseats, + pca_var, + encoders=design.encoders_) ``` ## Helper function diff --git a/docs/jupyterbook/transforms/poly.ipynb b/docs/jupyterbook/transforms/poly.ipynb index 54d7b4e..45c862e 100644 --- a/docs/jupyterbook/transforms/poly.ipynb +++ b/docs/jupyterbook/transforms/poly.ipynb @@ -168,7 +168,7 @@ "source": [ "## Underlying model\n", "\n", - "If we look at `quartic`, we see it is a `Variable`, i.e. it can be used to produce a set of columns\n", + "If we look at `quartic`, we see it is a `Feature`, i.e. it can be used to produce a set of columns\n", "in a design matrix when it is a term used in creating the `ModelSpec`.\n", "\n", "Its encoder is `Poly(degree=4)`. 
This is a special `sklearn` transform that expects a single column\n", @@ -319,9 +319,9 @@ "formats": "source/transforms///ipynb,jupyterbook/transforms///md:myst,jupyterbook/transforms///ipynb" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/docs/jupyterbook/transforms/poly.md b/docs/jupyterbook/transforms/poly.md index 45e0e3d..e5aef11 100644 --- a/docs/jupyterbook/transforms/poly.md +++ b/docs/jupyterbook/transforms/poly.md @@ -5,11 +5,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Polynomial features @@ -66,7 +66,7 @@ np.linalg.norm(ISLP_features - R_features) ## Underlying model -If we look at `quartic`, we see it is a `Variable`, i.e. it can be used to produce a set of columns +If we look at `quartic`, we see it is a `Feature`, i.e. it can be used to produce a set of columns in a design matrix when it is a term used in creating the `ModelSpec`. Its encoder is `Poly(degree=4)`. 
This is a special `sklearn` transform that expects a single column diff --git a/docs/jupyterbook/transforms/splines.ipynb b/docs/jupyterbook/transforms/splines.ipynb index f28d786..399b0be 100644 --- a/docs/jupyterbook/transforms/splines.ipynb +++ b/docs/jupyterbook/transforms/splines.ipynb @@ -310,9 +310,9 @@ "formats": "source/transforms///ipynb,jupyterbook/transforms///md:myst,jupyterbook/transforms///ipynb" }, "kernelspec": { - "display_name": "islp_test", + "display_name": "python3", "language": "python", - "name": "islp_test" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/docs/jupyterbook/transforms/splines.md b/docs/jupyterbook/transforms/splines.md index f14bc17..de0ee3d 100644 --- a/docs/jupyterbook/transforms/splines.md +++ b/docs/jupyterbook/transforms/splines.md @@ -5,11 +5,11 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.1 + jupytext_version: 1.14.5 kernelspec: - display_name: islp_test + display_name: python3 language: python - name: islp_test + name: python3 --- # Spline features diff --git a/docs/make_notebooks.py b/docs/make_notebooks.py new file mode 100644 index 0000000..cfea244 --- /dev/null +++ b/docs/make_notebooks.py @@ -0,0 +1,107 @@ +''' +Run notebooks in an isolated environment specified by a requirements.txt file +''' + +from hashlib import md5 +import tempfile +import os +from argparse import ArgumentParser + + +parser = ArgumentParser() +parser.add_argument('--requirements', + default='requirements.txt') +parser.add_argument('labs', + metavar='N', + type=str, + nargs='+') +parser.add_argument('--python', + default='3.10') +parser.add_argument('--tarball', + default=None, + dest='tarball') +parser.add_argument('--inplace', + default=False, + action='store_true', + help='run notebooks in place?') +parser.add_argument('--timeout', + default=5000, + help='preprocessor timeout') +parser.add_argument('--env_tag', + default='') + +def 
make_notebooks(requirements='requirements.txt', + srcs=[], + dests=[], + tarball='', + inplace=False, + tmpdir='', + python='3.10', + timeout=5000, # should be enough for Ch10 + env_tag='', + ): + + if tarball and inplace: + raise ValueError('tarball option expects notebooks in a tmpdir, while inplace does not copy to a tmpdir') + + md5_ = md5() + md5_.update(open(requirements, 'rb').read()); + hash_ = md5_.hexdigest()[:8] + + env_name = f'isolated_env_{hash_}' + env_tag + + setup_cmd = f''' + conda create -n {env_name} python={python} -y; + conda run -n {env_name} pip install -r {requirements} jupyter jupytext; + ''' + + print(setup_cmd) + os.system(setup_cmd) + + # may need to up "ulimit -n 4096" + archive_files = [] + for src_, dest_ in zip(srcs, dests): + if src_ != dest_: + os.system(f'cp {src_} {dest_}') + name = os.path.split(dest_)[1] + build_cmd = f'''conda run -n {env_name} jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout={timeout} {dest_} ''' + if '02' in name: + build_cmd += ' --allow-errors ' + + print(build_cmd) + os.system(build_cmd) + archive_files.append(name) + + archive_files = ' '.join(archive_files) + + if tarball: + tarball = os.path.abspath(tarball) + tarball_cmd = f''' + cd {tmpdir}; tar -cvzf {tarball} {archive_files} + ''' + print(tarball_cmd) + os.system(tarball_cmd) + + os.system(f'conda env remove -n {env_name}') + +if __name__ == '__main__': + + args = parser.parse_args() + srcs = [os.path.abspath(l) for l in args.labs] + + tmpdir = tempfile.mkdtemp() + + if args.inplace: + dests = srcs + else: + dests = [os.path.join(tmpdir, os.path.split(l)[1]) for l in args.labs] + + make_notebooks(requirements=os.path.abspath(args.requirements), + srcs=srcs, + dests=dests, + inplace=args.inplace, + tmpdir=tmpdir, + python=args.python, + tarball=args.tarball, + timeout=args.timeout, + env_tag=args.env_tag) diff --git a/docs/requirements.txt b/docs/requirements.txt index 68ef4bc..10bce0e 100644 --- a/docs/requirements.txt +++ 
b/docs/requirements.txt @@ -1,3 +1,7 @@ texext numpydoc myst_nb +sphinx-book-theme +rpy2 +sphinx_rtd_theme +jupytext diff --git a/docs/source/.ipynb_checkpoints/imdb-checkpoint.ipynb b/docs/source/.ipynb_checkpoints/imdb-checkpoint.ipynb deleted file mode 100644 index c78ca44..0000000 --- a/docs/source/.ipynb_checkpoints/imdb-checkpoint.ipynb +++ /dev/null @@ -1,271 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "3eff5ba8", - "metadata": {}, - "source": [ - "# Creating a clean IMDB dataset\n", - "\n", - "Running this example requires `keras`. Use `pip install keras` to install if necessary." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "53925437", - "metadata": {}, - "outputs": [], - "source": [ - "import pickle" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a855c7c0", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from scipy.sparse import coo_matrix, save_npz\n", - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fe16fa84", - "metadata": {}, - "outputs": [], - "source": [ - "from keras.datasets import imdb\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "0369a36a", - "metadata": {}, - "outputs": [], - "source": [ - "# the 3 is for three terms: \n", - "num_words = 10000+3\n", - "((S_train, Y_train), \n", - " (S_test, Y_test)) = imdb.load_data(num_words=num_words)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9e84d7e3", - "metadata": {}, - "outputs": [], - "source": [ - "Y_train = Y_train.astype(np.float32)\n", - "Y_test = Y_test.astype(np.float32)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1a737737", - "metadata": {}, - "outputs": [], - "source": [ - "def one_hot(sequences, ncol):\n", - " idx, vals = [], []\n", - " for i, s in enumerate(sequences):\n", - " idx.extend({(i,v):1 for v in 
s}.keys())\n", - " idx = np.array(idx).T\n", - " vals = np.ones(idx.shape[1], dtype=np.float32)\n", - " tens = torch.sparse_coo_tensor(indices=idx,\n", - " values=vals,\n", - " size=(len(sequences), ncol))\n", - " return tens.coalesce()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f08ad327", - "metadata": {}, - "outputs": [], - "source": [ - "X_train, L_train = one_hot(S_train, num_words), Y_train\n", - "X_test = one_hot(S_test, num_words)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "98481bbb", - "metadata": {}, - "outputs": [], - "source": [ - "def convert_sparse_tensor(X):\n", - " idx = np.asarray(X.indices())\n", - " vals = np.asarray(X.values())\n", - " return coo_matrix((vals,\n", - " (idx[0],\n", - " idx[1])),\n", - " shape=X.shape).tocsr()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "5a17bd62", - "metadata": {}, - "outputs": [], - "source": [ - "X_train_s = convert_sparse_tensor(X_train)\n", - "X_test_s = convert_sparse_tensor(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ca57aea4", - "metadata": {}, - "outputs": [], - "source": [ - "X_train_d = torch.tensor(X_train_s.todense())\n", - "X_test_d = torch.tensor(X_test_s.todense())" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3d017780", - "metadata": {}, - "outputs": [], - "source": [ - "torch.save(X_train_d, 'IMDB_X_train.tensor')\n", - "torch.save(X_test_d, 'IMDB_X_test.tensor')" - ] - }, - { - "cell_type": "markdown", - "id": "f9bb0163", - "metadata": {}, - "source": [ - "save the sparse matrices" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "23afd3e5", - "metadata": {}, - "outputs": [], - "source": [ - "save_npz('IMDB_X_test.npz', X_test_s)\n", - "save_npz('IMDB_X_train.npz', X_train_s)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d33568d1", - "metadata": {}, - "outputs": [], - "source": [ - "np.save('IMDB_Y_test.npy', 
Y_test)\n", - "np.save('IMDB_Y_train.npy', L_train)" - ] - }, - { - "cell_type": "markdown", - "id": "f9110984", - "metadata": {}, - "source": [ - "save and pickle the word index" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "ff44a0b4", - "metadata": {}, - "outputs": [], - "source": [ - "word_index = imdb.get_word_index()\n", - "lookup = {(i+3):w for w, i in word_index.items()}\n", - "lookup[0] = \"\"\n", - "lookup[1] = \"\"\n", - "lookup[2] = \"\"\n", - "lookup[4] = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "1486c640", - "metadata": {}, - "outputs": [], - "source": [ - "pickle.dump(lookup, open('IMDB_word_index.pkl', 'bw'))" - ] - }, - { - "cell_type": "markdown", - "id": "57e606c5", - "metadata": {}, - "source": [ - "create the padded representations" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "3ab7a4ac", - "metadata": {}, - "outputs": [], - "source": [ - "(S_train,\n", - " S_test) = [torch.tensor(pad_sequences(S, maxlen=500, value=0))\n", - " for S in [S_train,\n", - " S_test]]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "55cb2d49", - "metadata": {}, - "outputs": [], - "source": [ - "torch.save(S_train, 'IMDB_S_train.tensor')\n", - "torch.save(S_test, 'IMDB_S_test.tensor')" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "formats": "py:percent,ipynb,md:myst", - "main_language": "python" - }, - "kernelspec": { - "display_name": "islp_test", - "language": "python", - "name": "islp_test" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/api/gen.rst b/docs/source/api/gen.rst index 2539220..fb3bec5 100644 --- a/docs/source/api/gen.rst +++ 
b/docs/source/api/gen.rst @@ -6,7 +6,6 @@ generated/ISLP.bart.bart generated/ISLP.bart.likelihood generated/ISLP.bart.particle_tree - generated/ISLP.bart.tmpbart generated/ISLP.bart.tree generated/ISLP.cluster generated/ISLP.models diff --git a/docs/source/api/generated/ISLP.bart.tmpbart.rst b/docs/source/api/generated/ISLP.bart.tmpbart.rst deleted file mode 100644 index b72117a..0000000 --- a/docs/source/api/generated/ISLP.bart.tmpbart.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. AUTO-GENERATED FILE -- DO NOT EDIT! - -bart.tmpbart -============ - -Module: :mod:`bart.tmpbart` ---------------------------- -Inheritance diagram for ``ISLP.bart.tmpbart``: - -.. inheritance-diagram:: ISLP.bart.tmpbart - :parts: 3 - -.. automodule:: ISLP.bart.tmpbart - -.. currentmodule:: ISLP.bart.tmpbart - -Classes -------- - -:class:`BART` -~~~~~~~~~~~~~ - - -.. autoclass:: BART - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - - .. automethod:: __init__ - -:class:`SampleSplittingVariable` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -.. autoclass:: SampleSplittingVariable - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - - .. automethod:: __init__ diff --git a/docs/source/api/generated/ISLP.models.model_spec.rst b/docs/source/api/generated/ISLP.models.model_spec.rst index c379253..d457e3a 100644 --- a/docs/source/api/generated/ISLP.models.model_spec.rst +++ b/docs/source/api/generated/ISLP.models.model_spec.rst @@ -29,11 +29,11 @@ Classes .. automethod:: __init__ -:class:`ModelSpec` -~~~~~~~~~~~~~~~~~~ +:class:`Feature` +~~~~~~~~~~~~~~~~ -.. autoclass:: ModelSpec +.. autoclass:: Feature :members: :undoc-members: :show-inheritance: @@ -41,11 +41,11 @@ Classes .. automethod:: __init__ -:class:`Variable` -~~~~~~~~~~~~~~~~~ +:class:`ModelSpec` +~~~~~~~~~~~~~~~~~~ -.. autoclass:: Variable +.. autoclass:: ModelSpec :members: :undoc-members: :show-inheritance: @@ -63,10 +63,13 @@ Functions .. autofunction:: ISLP.models.model_spec.build_columns +.. 
autofunction:: ISLP.models.model_spec.build_model + + .. autofunction:: ISLP.models.model_spec.contrast -.. autofunction:: ISLP.models.model_spec.derived_variable +.. autofunction:: ISLP.models.model_spec.derived_feature .. autofunction:: ISLP.models.model_spec.fit_encoder diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 4734cda..8aededd 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -1,12 +1,7 @@ -.. _api-index: +ISLP reference +-------------- -##### - API -##### -.. only:: html +.. toctree:: - :Release: |version| - :Date: |today| - -.. include:: gen.rst + gen diff --git a/docs/source/conf.py b/docs/source/conf.py index 5da3dda..546d74f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -2,12 +2,32 @@ # -- Project information +import json +import os + project = 'ISLP' copyright = '2023, ISLP authors' author = 'Jonathan Taylor' -release = '0.1' -version = '0.1.0' +import ISLP +version = ISLP.__version__ + +import __main__ +dirname = os.path.split(__file__)[0] +print(dirname, 'dirname') + +docs_version = json.loads(open(os.path.join(dirname, 'docs_version.json')).read()) +lab_version = docs_version['labs'] + +myst_enable_extensions = ['substitution'] + +myst_substitutions = { + "ISLP_lab_link": f"[ISLP_labs/{lab_version}](https://github.com/intro-stat-learning/ISLP_labs/tree/{lab_version})", + "ISLP_zip_link": f"[ISLP_labs/{lab_version}.zip](https://github.com/intro-stat-learning/ISLP_labs/archive/refs/tags/{lab_version}.zip)", + "ISLP_binder_code": f"[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/{lab_version})", + "ISLP_lab_version": "[ISLP/{0}](https://github.com/intro-stat-learning/ISLP/tree/{0})".format(docs_version['library']) + } +myst_number_code_blocks = ['python', 'ipython3'] # -- General configuration @@ -27,7 +47,16 @@ graphviz_dot = '/opt/homebrew/bin/dot' numpydoc_class_members_toctree = False -nb_execution_mode = "cache" 
+nb_execution_mode = "auto" +nb_execution_timeout = 60*20 #*100 +# labs will be built with specific commits of ISLP/ISLP_labs +# we want Ch06 run to exlucde the warnings +nb_execution_excludepatterns = (['imdb.ipynb'] + + [f'Ch{i:02d}*' for i in range(2, 14)]) +print('exclude patterns', nb_execution_excludepatterns) +nb_execution_allow_errors = True + +#nb_kernel_rgx_aliases = {'python3': "islp_test"} intersphinx_mapping = { 'python': ('https://docs.python.org/3/', None), @@ -42,7 +71,19 @@ # -- Options for HTML output -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_book_theme" +html_theme_options = { + "repository_url": "https://github.com/intro-stat-learning/ISLP.git", + "use_repository_button": True, +} +html_title = "Introduction to Statistical Learning (Python)" +html_logo = "logo.png" + +source_suffix = { + '.rst': 'restructuredtext', + '.ipynb': 'myst-nb', + '.myst': 'myst-nb', +} # -- Options for EPUB output epub_show_urls = 'footnote' diff --git a/docs/source/datasets/Auto.ipynb b/docs/source/datasets/Auto.ipynb index b88ea02..b588844 100644 --- a/docs/source/datasets/Auto.ipynb +++ b/docs/source/datasets/Auto.ipynb @@ -44,7 +44,14 @@ "cell_type": "code", "execution_count": null, "id": "182ea1d1", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:43.883548Z", + "iopub.status.busy": "2023-07-26T12:47:43.883261Z", + "iopub.status.idle": "2023-07-26T12:47:44.433075Z", + "shell.execute_reply": "2023-07-26T12:47:44.432801Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -56,7 +63,14 @@ "cell_type": "code", "execution_count": null, "id": "979abd7e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:44.434662Z", + "iopub.status.busy": "2023-07-26T12:47:44.434558Z", + "iopub.status.idle": "2023-07-26T12:47:44.436577Z", + "shell.execute_reply": "2023-07-26T12:47:44.436322Z" + } + }, "outputs": [], "source": [ "Auto.shape" @@ -66,7 +80,14 @@ 
"cell_type": "code", "execution_count": null, "id": "7444c0f0", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:44.438047Z", + "iopub.status.busy": "2023-07-26T12:47:44.437943Z", + "iopub.status.idle": "2023-07-26T12:47:44.439951Z", + "shell.execute_reply": "2023-07-26T12:47:44.439712Z" + } + }, "outputs": [], "source": [ "Auto.columns" @@ -76,7 +97,14 @@ "cell_type": "code", "execution_count": null, "id": "59b6e919", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:44.441257Z", + "iopub.status.busy": "2023-07-26T12:47:44.441161Z", + "iopub.status.idle": "2023-07-26T12:47:44.449658Z", + "shell.execute_reply": "2023-07-26T12:47:44.449426Z" + } + }, "outputs": [], "source": [ "Auto.describe().iloc[:,:4]" @@ -91,6 +119,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Bikeshare.ipynb b/docs/source/datasets/Bikeshare.ipynb index ddb1053..ab42024 100644 --- a/docs/source/datasets/Bikeshare.ipynb +++ b/docs/source/datasets/Bikeshare.ipynb @@ -56,7 +56,14 @@ "cell_type": "code", "execution_count": null, "id": "bcdb89b6", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:19.462730Z", + "iopub.status.busy": "2023-07-26T12:47:19.461535Z", + "iopub.status.idle": "2023-07-26T12:47:20.022610Z", + "shell.execute_reply": "2023-07-26T12:47:20.022326Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -68,7 +75,14 @@ "cell_type": "code", "execution_count": null, "id": "72075fb0", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:20.024144Z", + 
"iopub.status.busy": "2023-07-26T12:47:20.024034Z", + "iopub.status.idle": "2023-07-26T12:47:20.026016Z", + "shell.execute_reply": "2023-07-26T12:47:20.025777Z" + } + }, "outputs": [], "source": [ "Bikeshare.shape" @@ -78,7 +92,14 @@ "cell_type": "code", "execution_count": null, "id": "45396d69", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:20.027480Z", + "iopub.status.busy": "2023-07-26T12:47:20.027378Z", + "iopub.status.idle": "2023-07-26T12:47:20.029427Z", + "shell.execute_reply": "2023-07-26T12:47:20.029199Z" + } + }, "outputs": [], "source": [ "Bikeshare.columns" @@ -88,7 +109,14 @@ "cell_type": "code", "execution_count": null, "id": "26c24d9a", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:20.030734Z", + "iopub.status.busy": "2023-07-26T12:47:20.030638Z", + "iopub.status.idle": "2023-07-26T12:47:20.042031Z", + "shell.execute_reply": "2023-07-26T12:47:20.041787Z" + } + }, "outputs": [], "source": [ "Bikeshare.describe().iloc[:,:4]" @@ -105,6 +133,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Boston.ipynb b/docs/source/datasets/Boston.ipynb index 569f5b4..027585a 100644 --- a/docs/source/datasets/Boston.ipynb +++ b/docs/source/datasets/Boston.ipynb @@ -49,7 +49,14 @@ "cell_type": "code", "execution_count": null, "id": "b8bb96f0", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:31.625524Z", + "iopub.status.busy": "2023-07-26T12:47:31.625196Z", + "iopub.status.idle": "2023-07-26T12:47:32.177553Z", + "shell.execute_reply": "2023-07-26T12:47:32.177240Z" + } + }, "outputs": [], "source": [ "from 
ISLP import load_data\n", @@ -61,7 +68,14 @@ "cell_type": "code", "execution_count": null, "id": "ab4b03f8", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:32.179272Z", + "iopub.status.busy": "2023-07-26T12:47:32.179157Z", + "iopub.status.idle": "2023-07-26T12:47:32.181230Z", + "shell.execute_reply": "2023-07-26T12:47:32.180964Z" + } + }, "outputs": [], "source": [ "Boston.shape" @@ -71,7 +85,14 @@ "cell_type": "code", "execution_count": null, "id": "74890e1f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:32.182653Z", + "iopub.status.busy": "2023-07-26T12:47:32.182557Z", + "iopub.status.idle": "2023-07-26T12:47:32.184501Z", + "shell.execute_reply": "2023-07-26T12:47:32.184276Z" + } + }, "outputs": [], "source": [ "Boston.columns" @@ -81,7 +102,14 @@ "cell_type": "code", "execution_count": null, "id": "90ecf46f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:32.185826Z", + "iopub.status.busy": "2023-07-26T12:47:32.185735Z", + "iopub.status.idle": "2023-07-26T12:47:32.198310Z", + "shell.execute_reply": "2023-07-26T12:47:32.198074Z" + } + }, "outputs": [], "source": [ "Boston.describe()" @@ -98,6 +126,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/BrainCancer.ipynb b/docs/source/datasets/BrainCancer.ipynb index cb75946..89e8b2c 100644 --- a/docs/source/datasets/BrainCancer.ipynb +++ b/docs/source/datasets/BrainCancer.ipynb @@ -39,7 +39,14 @@ "cell_type": "code", "execution_count": null, "id": "519fa8cf", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": 
"2023-07-26T12:47:09.619445Z", + "iopub.status.busy": "2023-07-26T12:47:09.618768Z", + "iopub.status.idle": "2023-07-26T12:47:10.149955Z", + "shell.execute_reply": "2023-07-26T12:47:10.149508Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -51,7 +58,14 @@ "cell_type": "code", "execution_count": null, "id": "ac7f1920", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:10.151658Z", + "iopub.status.busy": "2023-07-26T12:47:10.151541Z", + "iopub.status.idle": "2023-07-26T12:47:10.153944Z", + "shell.execute_reply": "2023-07-26T12:47:10.153658Z" + } + }, "outputs": [], "source": [ "BrainCancer.shape" @@ -61,7 +75,14 @@ "cell_type": "code", "execution_count": null, "id": "64b3177f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:10.155433Z", + "iopub.status.busy": "2023-07-26T12:47:10.155323Z", + "iopub.status.idle": "2023-07-26T12:47:10.157819Z", + "shell.execute_reply": "2023-07-26T12:47:10.157458Z" + } + }, "outputs": [], "source": [ "BrainCancer.columns" @@ -71,7 +92,14 @@ "cell_type": "code", "execution_count": null, "id": "8132496d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:10.159542Z", + "iopub.status.busy": "2023-07-26T12:47:10.159420Z", + "iopub.status.idle": "2023-07-26T12:47:10.166890Z", + "shell.execute_reply": "2023-07-26T12:47:10.166610Z" + } + }, "outputs": [], "source": [ "BrainCancer.describe()" @@ -81,7 +109,14 @@ "cell_type": "code", "execution_count": null, "id": "ed04719d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:10.168420Z", + "iopub.status.busy": "2023-07-26T12:47:10.168324Z", + "iopub.status.idle": "2023-07-26T12:47:10.171157Z", + "shell.execute_reply": "2023-07-26T12:47:10.170862Z" + } + }, "outputs": [], "source": [ "BrainCancer['diagnosis'].value_counts()" @@ -98,6 +133,18 @@ "display_name": "python3", "language": "python", 
"name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Caravan.ipynb b/docs/source/datasets/Caravan.ipynb index f093422..ab39457 100644 --- a/docs/source/datasets/Caravan.ipynb +++ b/docs/source/datasets/Caravan.ipynb @@ -27,7 +27,14 @@ "cell_type": "code", "execution_count": null, "id": "1f9a6aaa", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:12.041705Z", + "iopub.status.busy": "2023-07-26T12:47:12.040979Z", + "iopub.status.idle": "2023-07-26T12:47:12.637566Z", + "shell.execute_reply": "2023-07-26T12:47:12.637297Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -39,7 +46,14 @@ "cell_type": "code", "execution_count": null, "id": "88755969", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:12.639146Z", + "iopub.status.busy": "2023-07-26T12:47:12.639031Z", + "iopub.status.idle": "2023-07-26T12:47:12.640881Z", + "shell.execute_reply": "2023-07-26T12:47:12.640666Z" + } + }, "outputs": [], "source": [ "Caravan.shape" @@ -49,7 +63,14 @@ "cell_type": "code", "execution_count": null, "id": "52ea2641", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:12.642281Z", + "iopub.status.busy": "2023-07-26T12:47:12.642186Z", + "iopub.status.idle": "2023-07-26T12:47:12.644243Z", + "shell.execute_reply": "2023-07-26T12:47:12.644020Z" + } + }, "outputs": [], "source": [ "Caravan.columns[:20]" @@ -66,6 +87,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Carseats.ipynb b/docs/source/datasets/Carseats.ipynb index dfd36d4..92ff1b4 100644 --- a/docs/source/datasets/Carseats.ipynb +++ b/docs/source/datasets/Carseats.ipynb @@ -37,7 +37,14 @@ "cell_type": "code", "execution_count": null, "id": "984643c9", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:26.781289Z", + "iopub.status.busy": "2023-07-26T12:47:26.780964Z", + "iopub.status.idle": "2023-07-26T12:47:27.314225Z", + "shell.execute_reply": "2023-07-26T12:47:27.313885Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -49,7 +56,14 @@ "cell_type": "code", "execution_count": null, "id": "663f5f6a", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:27.316055Z", + "iopub.status.busy": "2023-07-26T12:47:27.315854Z", + "iopub.status.idle": "2023-07-26T12:47:27.318176Z", + "shell.execute_reply": "2023-07-26T12:47:27.317912Z" + } + }, "outputs": [], "source": [ "Carseats.shape" @@ -59,7 +73,14 @@ "cell_type": "code", "execution_count": null, "id": "386299b2", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:27.319606Z", + "iopub.status.busy": "2023-07-26T12:47:27.319504Z", + "iopub.status.idle": "2023-07-26T12:47:27.321648Z", + "shell.execute_reply": "2023-07-26T12:47:27.321403Z" + } + }, "outputs": [], "source": [ "Carseats.columns" @@ -69,7 +90,14 @@ "cell_type": "code", "execution_count": null, "id": "5c8c69c8", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:27.323205Z", + "iopub.status.busy": "2023-07-26T12:47:27.323091Z", + "iopub.status.idle": "2023-07-26T12:47:27.331921Z", + "shell.execute_reply": "2023-07-26T12:47:27.331627Z" + } + }, "outputs": [], "source": [ "Carseats.describe().iloc[:,:4]" @@ -86,6 +114,18 @@ "display_name": 
"python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/College.ipynb b/docs/source/datasets/College.ipynb index af1027d..27a4d1d 100644 --- a/docs/source/datasets/College.ipynb +++ b/docs/source/datasets/College.ipynb @@ -58,7 +58,14 @@ "cell_type": "code", "execution_count": null, "id": "680ceb3e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:17.006699Z", + "iopub.status.busy": "2023-07-26T12:47:17.006226Z", + "iopub.status.idle": "2023-07-26T12:47:17.561114Z", + "shell.execute_reply": "2023-07-26T12:47:17.560739Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -70,7 +77,14 @@ "cell_type": "code", "execution_count": null, "id": "ccdf3e4f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:17.563075Z", + "iopub.status.busy": "2023-07-26T12:47:17.562947Z", + "iopub.status.idle": "2023-07-26T12:47:17.565074Z", + "shell.execute_reply": "2023-07-26T12:47:17.564824Z" + } + }, "outputs": [], "source": [ "College.shape" @@ -80,7 +94,14 @@ "cell_type": "code", "execution_count": null, "id": "09f59747", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:17.566389Z", + "iopub.status.busy": "2023-07-26T12:47:17.566297Z", + "iopub.status.idle": "2023-07-26T12:47:17.568257Z", + "shell.execute_reply": "2023-07-26T12:47:17.568025Z" + } + }, "outputs": [], "source": [ "College.columns" @@ -90,7 +111,14 @@ "cell_type": "code", "execution_count": null, "id": "6a48dfd5", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:17.569585Z", + "iopub.status.busy": 
"2023-07-26T12:47:17.569492Z", + "iopub.status.idle": "2023-07-26T12:47:17.582384Z", + "shell.execute_reply": "2023-07-26T12:47:17.582154Z" + } + }, "outputs": [], "source": [ "College.describe().iloc[:,:4]" @@ -107,6 +135,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Credit.ipynb b/docs/source/datasets/Credit.ipynb index f5e51a9..d604aaa 100644 --- a/docs/source/datasets/Credit.ipynb +++ b/docs/source/datasets/Credit.ipynb @@ -43,7 +43,14 @@ "cell_type": "code", "execution_count": null, "id": "c4895446", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:39.024610Z", + "iopub.status.busy": "2023-07-26T12:47:39.024341Z", + "iopub.status.idle": "2023-07-26T12:47:39.593395Z", + "shell.execute_reply": "2023-07-26T12:47:39.593133Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -55,7 +62,14 @@ "cell_type": "code", "execution_count": null, "id": "c738c66b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:39.595074Z", + "iopub.status.busy": "2023-07-26T12:47:39.594871Z", + "iopub.status.idle": "2023-07-26T12:47:39.596893Z", + "shell.execute_reply": "2023-07-26T12:47:39.596667Z" + } + }, "outputs": [], "source": [ "Credit.shape" @@ -65,7 +79,14 @@ "cell_type": "code", "execution_count": null, "id": "d612f5a7", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:39.598266Z", + "iopub.status.busy": "2023-07-26T12:47:39.598173Z", + "iopub.status.idle": "2023-07-26T12:47:39.600134Z", + "shell.execute_reply": "2023-07-26T12:47:39.599913Z" + } + }, "outputs": [], "source": [ "Credit.columns" @@ 
-75,7 +96,14 @@ "cell_type": "code", "execution_count": null, "id": "45633b1a", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:39.601442Z", + "iopub.status.busy": "2023-07-26T12:47:39.601344Z", + "iopub.status.idle": "2023-07-26T12:47:39.609927Z", + "shell.execute_reply": "2023-07-26T12:47:39.609656Z" + } + }, "outputs": [], "source": [ "Credit.describe().iloc[:,:4]" @@ -92,6 +120,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Default.ipynb b/docs/source/datasets/Default.ipynb index 64357ef..8023d39 100644 --- a/docs/source/datasets/Default.ipynb +++ b/docs/source/datasets/Default.ipynb @@ -27,7 +27,14 @@ "cell_type": "code", "execution_count": null, "id": "ab810dee", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:36.566964Z", + "iopub.status.busy": "2023-07-26T12:47:36.566691Z", + "iopub.status.idle": "2023-07-26T12:47:37.127499Z", + "shell.execute_reply": "2023-07-26T12:47:37.127183Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -39,7 +46,14 @@ "cell_type": "code", "execution_count": null, "id": "086ef3a2", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:37.129114Z", + "iopub.status.busy": "2023-07-26T12:47:37.129003Z", + "iopub.status.idle": "2023-07-26T12:47:37.131023Z", + "shell.execute_reply": "2023-07-26T12:47:37.130778Z" + } + }, "outputs": [], "source": [ "Default.shape" @@ -49,7 +63,14 @@ "cell_type": "code", "execution_count": null, "id": "6600c13b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:37.132471Z", + 
"iopub.status.busy": "2023-07-26T12:47:37.132373Z", + "iopub.status.idle": "2023-07-26T12:47:37.134281Z", + "shell.execute_reply": "2023-07-26T12:47:37.134067Z" + } + }, "outputs": [], "source": [ "Default.columns" @@ -59,7 +80,14 @@ "cell_type": "code", "execution_count": null, "id": "09e98840", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:37.135578Z", + "iopub.status.busy": "2023-07-26T12:47:37.135480Z", + "iopub.status.idle": "2023-07-26T12:47:37.141213Z", + "shell.execute_reply": "2023-07-26T12:47:37.140974Z" + } + }, "outputs": [], "source": [ "Default.describe()" @@ -69,7 +97,14 @@ "cell_type": "code", "execution_count": null, "id": "425f0cb1", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:37.142597Z", + "iopub.status.busy": "2023-07-26T12:47:37.142519Z", + "iopub.status.idle": "2023-07-26T12:47:37.145148Z", + "shell.execute_reply": "2023-07-26T12:47:37.144915Z" + } + }, "outputs": [], "source": [ "Default['student'].value_counts()" @@ -86,6 +121,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Fund.ipynb b/docs/source/datasets/Fund.ipynb index fce1859..2e5dcb5 100644 --- a/docs/source/datasets/Fund.ipynb +++ b/docs/source/datasets/Fund.ipynb @@ -15,7 +15,14 @@ "cell_type": "code", "execution_count": null, "id": "5eba8e49", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:59.809785Z", + "iopub.status.busy": "2023-07-26T12:46:59.809389Z", + "iopub.status.idle": "2023-07-26T12:47:00.410897Z", + "shell.execute_reply": "2023-07-26T12:47:00.410627Z" + } + }, "outputs": [], "source": [ "from ISLP 
import load_data\n", @@ -27,7 +34,14 @@ "cell_type": "code", "execution_count": null, "id": "ced3b335", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:00.412492Z", + "iopub.status.busy": "2023-07-26T12:47:00.412385Z", + "iopub.status.idle": "2023-07-26T12:47:00.414444Z", + "shell.execute_reply": "2023-07-26T12:47:00.414168Z" + } + }, "outputs": [], "source": [ "Fund.shape" @@ -37,7 +51,14 @@ "cell_type": "code", "execution_count": null, "id": "bfff1ac6", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:00.415891Z", + "iopub.status.busy": "2023-07-26T12:47:00.415789Z", + "iopub.status.idle": "2023-07-26T12:47:00.417755Z", + "shell.execute_reply": "2023-07-26T12:47:00.417529Z" + } + }, "outputs": [], "source": [ "Fund.columns" @@ -54,6 +75,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Hitters.ipynb b/docs/source/datasets/Hitters.ipynb index 6f261cd..5af634c 100644 --- a/docs/source/datasets/Hitters.ipynb +++ b/docs/source/datasets/Hitters.ipynb @@ -64,7 +64,14 @@ "cell_type": "code", "execution_count": null, "id": "4fa187f0", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:34.072657Z", + "iopub.status.busy": "2023-07-26T12:47:34.072382Z", + "iopub.status.idle": "2023-07-26T12:47:34.654518Z", + "shell.execute_reply": "2023-07-26T12:47:34.654230Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -76,7 +83,14 @@ "cell_type": "code", "execution_count": null, "id": "04535ffb", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:34.656071Z", + 
"iopub.status.busy": "2023-07-26T12:47:34.655969Z", + "iopub.status.idle": "2023-07-26T12:47:34.657899Z", + "shell.execute_reply": "2023-07-26T12:47:34.657674Z" + } + }, "outputs": [], "source": [ "Hitters.shape" @@ -86,7 +100,14 @@ "cell_type": "code", "execution_count": null, "id": "6875aac6", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:34.659335Z", + "iopub.status.busy": "2023-07-26T12:47:34.659236Z", + "iopub.status.idle": "2023-07-26T12:47:34.661182Z", + "shell.execute_reply": "2023-07-26T12:47:34.660944Z" + } + }, "outputs": [], "source": [ "Hitters.columns" @@ -96,7 +117,14 @@ "cell_type": "code", "execution_count": null, "id": "9e2cffc8", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:34.662645Z", + "iopub.status.busy": "2023-07-26T12:47:34.662543Z", + "iopub.status.idle": "2023-07-26T12:47:34.674958Z", + "shell.execute_reply": "2023-07-26T12:47:34.674698Z" + } + }, "outputs": [], "source": [ "Hitters.describe().iloc[:,:4]" @@ -113,6 +141,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Khan.ipynb b/docs/source/datasets/Khan.ipynb index f12a5ca..c1ce7bf 100644 --- a/docs/source/datasets/Khan.ipynb +++ b/docs/source/datasets/Khan.ipynb @@ -43,7 +43,14 @@ "cell_type": "code", "execution_count": null, "id": "bfda6cad", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:53.879692Z", + "iopub.status.busy": "2023-07-26T12:46:53.879072Z", + "iopub.status.idle": "2023-07-26T12:46:54.473904Z", + "shell.execute_reply": "2023-07-26T12:46:54.473562Z" + } + }, "outputs": [], "source": [ "from ISLP import 
load_data\n", @@ -55,7 +62,14 @@ "cell_type": "code", "execution_count": null, "id": "70514dc5", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:54.475443Z", + "iopub.status.busy": "2023-07-26T12:46:54.475340Z", + "iopub.status.idle": "2023-07-26T12:46:54.477103Z", + "shell.execute_reply": "2023-07-26T12:46:54.476883Z" + } + }, "outputs": [], "source": [ "for X in ['xtest', 'xtrain']:\n", @@ -66,7 +80,14 @@ "cell_type": "code", "execution_count": null, "id": "e9df5de8", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:54.478408Z", + "iopub.status.busy": "2023-07-26T12:46:54.478336Z", + "iopub.status.idle": "2023-07-26T12:46:54.480540Z", + "shell.execute_reply": "2023-07-26T12:46:54.480299Z" + } + }, "outputs": [], "source": [ "for Y in ['ytest', 'ytrain']:\n", @@ -84,6 +105,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/NCI60.ipynb b/docs/source/datasets/NCI60.ipynb index bbb576f..b38f981 100644 --- a/docs/source/datasets/NCI60.ipynb +++ b/docs/source/datasets/NCI60.ipynb @@ -26,7 +26,14 @@ "cell_type": "code", "execution_count": null, "id": "c88c2eaf", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:07.189429Z", + "iopub.status.busy": "2023-07-26T12:47:07.188891Z", + "iopub.status.idle": "2023-07-26T12:47:07.734853Z", + "shell.execute_reply": "2023-07-26T12:47:07.734392Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -38,7 +45,14 @@ "cell_type": "code", "execution_count": null, "id": "0e6279ad", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": 
"2023-07-26T12:47:07.736643Z", + "iopub.status.busy": "2023-07-26T12:47:07.736477Z", + "iopub.status.idle": "2023-07-26T12:47:07.740295Z", + "shell.execute_reply": "2023-07-26T12:47:07.739954Z" + } + }, "outputs": [], "source": [ "NCI60['labels'].value_counts()" @@ -48,7 +62,14 @@ "cell_type": "code", "execution_count": null, "id": "ed5ddd2f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:07.741963Z", + "iopub.status.busy": "2023-07-26T12:47:07.741866Z", + "iopub.status.idle": "2023-07-26T12:47:07.744496Z", + "shell.execute_reply": "2023-07-26T12:47:07.744146Z" + } + }, "outputs": [], "source": [ "NCI60['data'].shape" @@ -65,6 +86,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/NYSE.ipynb b/docs/source/datasets/NYSE.ipynb index 5f9dbd5..4fb6ea5 100644 --- a/docs/source/datasets/NYSE.ipynb +++ b/docs/source/datasets/NYSE.ipynb @@ -33,7 +33,14 @@ "cell_type": "code", "execution_count": null, "id": "fcff6c95", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:24.365935Z", + "iopub.status.busy": "2023-07-26T12:47:24.365648Z", + "iopub.status.idle": "2023-07-26T12:47:24.910157Z", + "shell.execute_reply": "2023-07-26T12:47:24.909886Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -45,7 +52,14 @@ "cell_type": "code", "execution_count": null, "id": "84426961", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:24.911976Z", + "iopub.status.busy": "2023-07-26T12:47:24.911859Z", + "iopub.status.idle": "2023-07-26T12:47:24.913899Z", + "shell.execute_reply": "2023-07-26T12:47:24.913685Z" + } 
+ }, "outputs": [], "source": [ "NYSE.shape" @@ -55,7 +69,14 @@ "cell_type": "code", "execution_count": null, "id": "e6194a8c", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:24.915295Z", + "iopub.status.busy": "2023-07-26T12:47:24.915180Z", + "iopub.status.idle": "2023-07-26T12:47:24.917209Z", + "shell.execute_reply": "2023-07-26T12:47:24.916991Z" + } + }, "outputs": [], "source": [ "NYSE.columns" @@ -65,7 +86,14 @@ "cell_type": "code", "execution_count": null, "id": "0c7bf3d7", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:24.918571Z", + "iopub.status.busy": "2023-07-26T12:47:24.918468Z", + "iopub.status.idle": "2023-07-26T12:47:24.924914Z", + "shell.execute_reply": "2023-07-26T12:47:24.924671Z" + } + }, "outputs": [], "source": [ "NYSE.describe()" @@ -82,6 +110,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/OJ.ipynb b/docs/source/datasets/OJ.ipynb index e18a4de..55ffeb9 100644 --- a/docs/source/datasets/OJ.ipynb +++ b/docs/source/datasets/OJ.ipynb @@ -61,7 +61,14 @@ "cell_type": "code", "execution_count": null, "id": "609742da", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:14.553008Z", + "iopub.status.busy": "2023-07-26T12:47:14.551694Z", + "iopub.status.idle": "2023-07-26T12:47:15.102658Z", + "shell.execute_reply": "2023-07-26T12:47:15.102334Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -73,7 +80,14 @@ "cell_type": "code", "execution_count": null, "id": "6f195dcd", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": 
"2023-07-26T12:47:15.104419Z", + "iopub.status.busy": "2023-07-26T12:47:15.104301Z", + "iopub.status.idle": "2023-07-26T12:47:15.106415Z", + "shell.execute_reply": "2023-07-26T12:47:15.106177Z" + } + }, "outputs": [], "source": [ "OJ.shape" @@ -83,7 +97,14 @@ "cell_type": "code", "execution_count": null, "id": "aaafb83b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:15.107821Z", + "iopub.status.busy": "2023-07-26T12:47:15.107723Z", + "iopub.status.idle": "2023-07-26T12:47:15.109747Z", + "shell.execute_reply": "2023-07-26T12:47:15.109486Z" + } + }, "outputs": [], "source": [ "OJ.columns" @@ -93,7 +114,14 @@ "cell_type": "code", "execution_count": null, "id": "774dfa86", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:15.111066Z", + "iopub.status.busy": "2023-07-26T12:47:15.110974Z", + "iopub.status.idle": "2023-07-26T12:47:15.123225Z", + "shell.execute_reply": "2023-07-26T12:47:15.122965Z" + } + }, "outputs": [], "source": [ "OJ.describe().iloc[:,:4]" @@ -110,6 +138,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Portfolio.ipynb b/docs/source/datasets/Portfolio.ipynb index 6d6a60d..1a6d711 100644 --- a/docs/source/datasets/Portfolio.ipynb +++ b/docs/source/datasets/Portfolio.ipynb @@ -22,7 +22,14 @@ "cell_type": "code", "execution_count": null, "id": "3adff220", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:02.309375Z", + "iopub.status.busy": "2023-07-26T12:47:02.308873Z", + "iopub.status.idle": "2023-07-26T12:47:02.849537Z", + "shell.execute_reply": "2023-07-26T12:47:02.849247Z" + } + }, 
"outputs": [], "source": [ "from ISLP import load_data\n", @@ -34,7 +41,14 @@ "cell_type": "code", "execution_count": null, "id": "b02a9e67", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:02.851392Z", + "iopub.status.busy": "2023-07-26T12:47:02.851244Z", + "iopub.status.idle": "2023-07-26T12:47:02.853779Z", + "shell.execute_reply": "2023-07-26T12:47:02.853348Z" + } + }, "outputs": [], "source": [ "Portfolio.shape" @@ -44,7 +58,14 @@ "cell_type": "code", "execution_count": null, "id": "3e83a0ed", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:02.855660Z", + "iopub.status.busy": "2023-07-26T12:47:02.855540Z", + "iopub.status.idle": "2023-07-26T12:47:02.858065Z", + "shell.execute_reply": "2023-07-26T12:47:02.857779Z" + } + }, "outputs": [], "source": [ "Portfolio.columns" @@ -54,7 +75,14 @@ "cell_type": "code", "execution_count": null, "id": "3ebec412", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:02.859606Z", + "iopub.status.busy": "2023-07-26T12:47:02.859503Z", + "iopub.status.idle": "2023-07-26T12:47:02.865754Z", + "shell.execute_reply": "2023-07-26T12:47:02.865418Z" + } + }, "outputs": [], "source": [ "Portfolio.describe()" @@ -71,6 +99,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Publication.ipynb b/docs/source/datasets/Publication.ipynb index a4a6dfa..de4a449 100644 --- a/docs/source/datasets/Publication.ipynb +++ b/docs/source/datasets/Publication.ipynb @@ -45,7 +45,14 @@ "cell_type": "code", "execution_count": null, "id": "61d7c2b3", - "metadata": {}, + "metadata": { + "execution": { + 
"iopub.execute_input": "2023-07-26T12:47:29.196850Z", + "iopub.status.busy": "2023-07-26T12:47:29.196559Z", + "iopub.status.idle": "2023-07-26T12:47:29.727827Z", + "shell.execute_reply": "2023-07-26T12:47:29.727421Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -57,7 +64,14 @@ "cell_type": "code", "execution_count": null, "id": "4d72460d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:29.729844Z", + "iopub.status.busy": "2023-07-26T12:47:29.729686Z", + "iopub.status.idle": "2023-07-26T12:47:29.732275Z", + "shell.execute_reply": "2023-07-26T12:47:29.732008Z" + } + }, "outputs": [], "source": [ "Publication.shape" @@ -67,7 +81,14 @@ "cell_type": "code", "execution_count": null, "id": "fd34224c", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:29.734028Z", + "iopub.status.busy": "2023-07-26T12:47:29.733885Z", + "iopub.status.idle": "2023-07-26T12:47:29.736365Z", + "shell.execute_reply": "2023-07-26T12:47:29.736014Z" + } + }, "outputs": [], "source": [ "Publication.columns" @@ -77,7 +98,14 @@ "cell_type": "code", "execution_count": null, "id": "51bfb0aa", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:29.738169Z", + "iopub.status.busy": "2023-07-26T12:47:29.738046Z", + "iopub.status.idle": "2023-07-26T12:47:29.747027Z", + "shell.execute_reply": "2023-07-26T12:47:29.746722Z" + } + }, "outputs": [], "source": [ "Publication.describe().iloc[:,:4]" @@ -94,6 +122,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Smarket.ipynb b/docs/source/datasets/Smarket.ipynb index 
cced2a9..0be4dd9 100644 --- a/docs/source/datasets/Smarket.ipynb +++ b/docs/source/datasets/Smarket.ipynb @@ -41,7 +41,14 @@ "cell_type": "code", "execution_count": null, "id": "3d920337", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:21.928355Z", + "iopub.status.busy": "2023-07-26T12:47:21.927766Z", + "iopub.status.idle": "2023-07-26T12:47:22.480597Z", + "shell.execute_reply": "2023-07-26T12:47:22.480297Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -53,7 +60,14 @@ "cell_type": "code", "execution_count": null, "id": "25d90138", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:22.482125Z", + "iopub.status.busy": "2023-07-26T12:47:22.482016Z", + "iopub.status.idle": "2023-07-26T12:47:22.484017Z", + "shell.execute_reply": "2023-07-26T12:47:22.483801Z" + } + }, "outputs": [], "source": [ "Smarket.shape" @@ -63,7 +77,14 @@ "cell_type": "code", "execution_count": null, "id": "0e8c57de", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:22.485456Z", + "iopub.status.busy": "2023-07-26T12:47:22.485359Z", + "iopub.status.idle": "2023-07-26T12:47:22.487416Z", + "shell.execute_reply": "2023-07-26T12:47:22.487186Z" + } + }, "outputs": [], "source": [ "Smarket.columns" @@ -73,7 +94,14 @@ "cell_type": "code", "execution_count": null, "id": "2d455f1e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:22.488803Z", + "iopub.status.busy": "2023-07-26T12:47:22.488706Z", + "iopub.status.idle": "2023-07-26T12:47:22.497401Z", + "shell.execute_reply": "2023-07-26T12:47:22.497165Z" + } + }, "outputs": [], "source": [ "Smarket.describe().iloc[:,-4:]" @@ -90,6 +118,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/USArrests.ipynb b/docs/source/datasets/USArrests.ipynb index 1107424..d860098 100644 --- a/docs/source/datasets/USArrests.ipynb +++ b/docs/source/datasets/USArrests.ipynb @@ -28,9 +28,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "feab45d4-ce30-4ea9-800c-bbe9e7c11f6d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:56.351520Z", + "iopub.status.busy": "2023-07-26T12:46:56.350481Z", + "iopub.status.idle": "2023-07-26T12:46:58.021100Z", + "shell.execute_reply": "2023-07-26T12:46:58.019698Z" + } + }, "outputs": [], "source": [ "from statsmodels.datasets import get_rdataset\n", @@ -39,157 +46,51 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "bdfffad4-6ab1-45da-8d62-8a7c4326fb24", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(50, 4)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:58.027241Z", + "iopub.status.busy": "2023-07-26T12:46:58.026857Z", + "iopub.status.idle": "2023-07-26T12:46:58.034424Z", + "shell.execute_reply": "2023-07-26T12:46:58.033781Z" } - ], + }, + "outputs": [], "source": [ "USArrests.shape" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "02f28a67-e8b9-4a17-ad0d-88672e1de26d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Murder', 'Assault', 'UrbanPop', 'Rape'], dtype='object')" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:58.038173Z", + "iopub.status.busy": "2023-07-26T12:46:58.037943Z", + "iopub.status.idle": "2023-07-26T12:46:58.041828Z", + 
"shell.execute_reply": "2023-07-26T12:46:58.041345Z" } - ], + }, + "outputs": [], "source": [ "USArrests.columns" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "711db396-64d6-4fbd-9be4-bebe4117216f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MurderAssaultUrbanPopRape
count50.0000050.00000050.00000050.000000
mean7.78800170.76000065.54000021.232000
std4.3555183.33766114.4747639.366385
min0.8000045.00000032.0000007.300000
25%4.07500109.00000054.50000015.075000
50%7.25000159.00000066.00000020.100000
75%11.25000249.00000077.75000026.175000
max17.40000337.00000091.00000046.000000
\n", - "
" - ], - "text/plain": [ - " Murder Assault UrbanPop Rape\n", - "count 50.00000 50.000000 50.000000 50.000000\n", - "mean 7.78800 170.760000 65.540000 21.232000\n", - "std 4.35551 83.337661 14.474763 9.366385\n", - "min 0.80000 45.000000 32.000000 7.300000\n", - "25% 4.07500 109.000000 54.500000 15.075000\n", - "50% 7.25000 159.000000 66.000000 20.100000\n", - "75% 11.25000 249.000000 77.750000 26.175000\n", - "max 17.40000 337.000000 91.000000 46.000000" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:58.044543Z", + "iopub.status.busy": "2023-07-26T12:46:58.044381Z", + "iopub.status.idle": "2023-07-26T12:46:58.057559Z", + "shell.execute_reply": "2023-07-26T12:46:58.057142Z" } - ], + }, + "outputs": [], "source": [ "USArrests.describe()" ] @@ -216,7 +117,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Wage.ipynb b/docs/source/datasets/Wage.ipynb index b95d853..28bb484 100644 --- a/docs/source/datasets/Wage.ipynb +++ b/docs/source/datasets/Wage.ipynb @@ -53,7 +53,14 @@ "cell_type": "code", "execution_count": null, "id": "6832d321", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:04.731864Z", + "iopub.status.busy": "2023-07-26T12:47:04.731413Z", + "iopub.status.idle": "2023-07-26T12:47:05.295785Z", + "shell.execute_reply": "2023-07-26T12:47:05.295452Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -65,7 +72,14 @@ "cell_type": "code", "execution_count": null, "id": "1c1ad3f3", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:05.297482Z", + "iopub.status.busy": "2023-07-26T12:47:05.297357Z", + "iopub.status.idle": "2023-07-26T12:47:05.299508Z", + "shell.execute_reply": "2023-07-26T12:47:05.299247Z" + } + }, 
"outputs": [], "source": [ "Wage.shape" @@ -75,7 +89,14 @@ "cell_type": "code", "execution_count": null, "id": "d56ab6a4", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:05.300989Z", + "iopub.status.busy": "2023-07-26T12:47:05.300875Z", + "iopub.status.idle": "2023-07-26T12:47:05.303024Z", + "shell.execute_reply": "2023-07-26T12:47:05.302786Z" + } + }, "outputs": [], "source": [ "Wage.columns" @@ -85,7 +106,14 @@ "cell_type": "code", "execution_count": null, "id": "5f021939", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:05.304606Z", + "iopub.status.busy": "2023-07-26T12:47:05.304487Z", + "iopub.status.idle": "2023-07-26T12:47:05.311771Z", + "shell.execute_reply": "2023-07-26T12:47:05.311522Z" + } + }, "outputs": [], "source": [ "Wage.describe()" @@ -102,6 +130,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/datasets/Weekly.ipynb b/docs/source/datasets/Weekly.ipynb index 69f26d6..15a1050 100644 --- a/docs/source/datasets/Weekly.ipynb +++ b/docs/source/datasets/Weekly.ipynb @@ -41,7 +41,14 @@ "cell_type": "code", "execution_count": null, "id": "d19dd431", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:41.468580Z", + "iopub.status.busy": "2023-07-26T12:47:41.468291Z", + "iopub.status.idle": "2023-07-26T12:47:41.999679Z", + "shell.execute_reply": "2023-07-26T12:47:41.999341Z" + } + }, "outputs": [], "source": [ "from ISLP import load_data\n", @@ -53,7 +60,14 @@ "cell_type": "code", "execution_count": null, "id": "17d2cda4", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": 
"2023-07-26T12:47:42.002632Z", + "iopub.status.busy": "2023-07-26T12:47:42.002470Z", + "iopub.status.idle": "2023-07-26T12:47:42.004871Z", + "shell.execute_reply": "2023-07-26T12:47:42.004611Z" + } + }, "outputs": [], "source": [ "Weekly.shape" @@ -63,7 +77,14 @@ "cell_type": "code", "execution_count": null, "id": "f822715b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:42.006534Z", + "iopub.status.busy": "2023-07-26T12:47:42.006422Z", + "iopub.status.idle": "2023-07-26T12:47:42.008496Z", + "shell.execute_reply": "2023-07-26T12:47:42.008187Z" + } + }, "outputs": [], "source": [ "Weekly.columns" @@ -73,7 +94,14 @@ "cell_type": "code", "execution_count": null, "id": "9a5f4d04", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:47:42.010010Z", + "iopub.status.busy": "2023-07-26T12:47:42.009911Z", + "iopub.status.idle": "2023-07-26T12:47:42.019036Z", + "shell.execute_reply": "2023-07-26T12:47:42.018706Z" + } + }, "outputs": [], "source": [ "Weekly.describe().iloc[:,:4]" @@ -98,6 +126,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/docs_version.json b/docs/source/docs_version.json new file mode 100644 index 0000000..d6217ce --- /dev/null +++ b/docs/source/docs_version.json @@ -0,0 +1,4 @@ +{"labs": "v2.2", + "library": "v0.4.0", + "comment":"labs should be version of ISLP pointed to in ISLP_labs/README.md, library version should be explicitly marked in ISLP_labs/requirements.txt; don't forget to strip warnings!!!!!!!!" 
+} diff --git a/docs/source/helpers/cluster.ipynb b/docs/source/helpers/cluster.ipynb index 56cf3d8..4aa7de3 100644 --- a/docs/source/helpers/cluster.ipynb +++ b/docs/source/helpers/cluster.ipynb @@ -8,14 +8,21 @@ "# Clustering\n", "\n", "This module has a single function, used to help visualize a dendrogram from a\n", - "hierarchical clustering." + "hierarchical clustering. The function is based on this example from [sklearn.cluster](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html)." ] }, { "cell_type": "code", "execution_count": null, "id": "d5df152d", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:42.214971Z", + "iopub.status.busy": "2023-07-26T12:46:42.214537Z", + "iopub.status.idle": "2023-07-26T12:46:42.860533Z", + "shell.execute_reply": "2023-07-26T12:46:42.860243Z" + } + }, "outputs": [], "source": [ "import numpy as np\n", @@ -36,7 +43,14 @@ "cell_type": "code", "execution_count": null, "id": "0135c1fb", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:42.862401Z", + "iopub.status.busy": "2023-07-26T12:46:42.862250Z", + "iopub.status.idle": "2023-07-26T12:46:42.864336Z", + "shell.execute_reply": "2023-07-26T12:46:42.864118Z" + } + }, "outputs": [], "source": [ "rng = np.random.default_rng(1)\n", @@ -56,7 +70,14 @@ "cell_type": "code", "execution_count": null, "id": "17c52650", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:42.865831Z", + "iopub.status.busy": "2023-07-26T12:46:42.865731Z", + "iopub.status.idle": "2023-07-26T12:46:42.867386Z", + "shell.execute_reply": "2023-07-26T12:46:42.867147Z" + } + }, "outputs": [], "source": [ "clust = AgglomerativeClustering(distance_threshold=0,\n", @@ -68,7 +89,14 @@ "cell_type": "code", "execution_count": null, "id": "a3ae2622", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:42.868746Z", + 
"iopub.status.busy": "2023-07-26T12:46:42.868668Z", + "iopub.status.idle": "2023-07-26T12:46:42.872497Z", + "shell.execute_reply": "2023-07-26T12:46:42.872240Z" + } + }, "outputs": [], "source": [ "clust.fit(X)" @@ -86,7 +114,14 @@ "cell_type": "code", "execution_count": null, "id": "64e726a4", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:42.873930Z", + "iopub.status.busy": "2023-07-26T12:46:42.873845Z", + "iopub.status.idle": "2023-07-26T12:46:43.195508Z", + "shell.execute_reply": "2023-07-26T12:46:43.195084Z" + } + }, "outputs": [], "source": [ "linkage = compute_linkage(clust)\n", @@ -101,9 +136,21 @@ "main_language": "python" }, "kernelspec": { - "display_name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/helpers/pygam.ipynb b/docs/source/helpers/pygam.ipynb index aab61d1..b452294 100644 --- a/docs/source/helpers/pygam.ipynb +++ b/docs/source/helpers/pygam.ipynb @@ -16,7 +16,14 @@ "cell_type": "code", "execution_count": null, "id": "9a52fb27", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:47.820912Z", + "iopub.status.busy": "2023-07-26T12:46:47.820490Z", + "iopub.status.idle": "2023-07-26T12:46:48.577304Z", + "shell.execute_reply": "2023-07-26T12:46:48.577007Z" + } + }, "outputs": [], "source": [ "import numpy as np\n", @@ -46,7 +53,14 @@ "cell_type": "code", "execution_count": null, "id": "4bddce77", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.579295Z", + "iopub.status.busy": "2023-07-26T12:46:48.579114Z", + "iopub.status.idle": "2023-07-26T12:46:48.581608Z", + 
"shell.execute_reply": "2023-07-26T12:46:48.581355Z" + } + }, "outputs": [], "source": [ "rng = np.random.default_rng(1)\n", @@ -69,7 +83,14 @@ "cell_type": "code", "execution_count": null, "id": "3f8946e0", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.583287Z", + "iopub.status.busy": "2023-07-26T12:46:48.583187Z", + "iopub.status.idle": "2023-07-26T12:46:48.618486Z", + "shell.execute_reply": "2023-07-26T12:46:48.614888Z" + } + }, "outputs": [], "source": [ "terms = [s(f, lam=0.01) for f in range(3)]\n", @@ -91,7 +112,14 @@ "cell_type": "code", "execution_count": null, "id": "c5b38706", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.624580Z", + "iopub.status.busy": "2023-07-26T12:46:48.624177Z", + "iopub.status.idle": "2023-07-26T12:46:48.814238Z", + "shell.execute_reply": "2023-07-26T12:46:48.808746Z" + } + }, "outputs": [], "source": [ "ax = plot(gam, 0)" @@ -109,7 +137,14 @@ "cell_type": "code", "execution_count": null, "id": "e4d2b6f0", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.825281Z", + "iopub.status.busy": "2023-07-26T12:46:48.824327Z", + "iopub.status.idle": "2023-07-26T12:46:48.897739Z", + "shell.execute_reply": "2023-07-26T12:46:48.897447Z" + } + }, "outputs": [], "source": [ "ax.scatter(X[:,0], \n", @@ -131,7 +166,14 @@ "cell_type": "code", "execution_count": null, "id": "82374baa", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.899404Z", + "iopub.status.busy": "2023-07-26T12:46:48.899288Z", + "iopub.status.idle": "2023-07-26T12:46:48.916570Z", + "shell.execute_reply": "2023-07-26T12:46:48.915079Z" + } + }, "outputs": [], "source": [ "[degrees_of_freedom(X,\n", @@ -153,7 +195,14 @@ "cell_type": "code", "execution_count": null, "id": "0576d1f3", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.924539Z", 
+ "iopub.status.busy": "2023-07-26T12:46:48.924174Z", + "iopub.status.idle": "2023-07-26T12:46:48.955630Z", + "shell.execute_reply": "2023-07-26T12:46:48.954722Z" + } + }, "outputs": [], "source": [ "lam_vals = [approx_lam(X,\n", @@ -174,7 +223,14 @@ "cell_type": "code", "execution_count": null, "id": "3a8b546e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.961056Z", + "iopub.status.busy": "2023-07-26T12:46:48.960521Z", + "iopub.status.idle": "2023-07-26T12:46:48.989331Z", + "shell.execute_reply": "2023-07-26T12:46:48.987244Z" + } + }, "outputs": [], "source": [ "fixed_terms = [s(f, lam=l) for \n", @@ -189,7 +245,14 @@ "cell_type": "code", "execution_count": null, "id": "f2cfbea2", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:48.995461Z", + "iopub.status.busy": "2023-07-26T12:46:48.994945Z", + "iopub.status.idle": "2023-07-26T12:46:49.130069Z", + "shell.execute_reply": "2023-07-26T12:46:49.129127Z" + } + }, "outputs": [], "source": [ "ax = plot(fixed_gam, 0)\n", @@ -210,6 +273,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/helpers/survival.ipynb b/docs/source/helpers/survival.ipynb index 7cb30a3..f90123e 100644 --- a/docs/source/helpers/survival.ipynb +++ b/docs/source/helpers/survival.ipynb @@ -15,7 +15,14 @@ "cell_type": "code", "execution_count": null, "id": "0932cabc", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:45.058072Z", + "iopub.status.busy": "2023-07-26T12:46:45.057742Z", + "iopub.status.idle": "2023-07-26T12:46:45.657730Z", + "shell.execute_reply": "2023-07-26T12:46:45.657332Z" + } + 
}, "outputs": [], "source": [ "import numpy as np\n", @@ -40,7 +47,14 @@ "cell_type": "code", "execution_count": null, "id": "d82896bb", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:45.659634Z", + "iopub.status.busy": "2023-07-26T12:46:45.659493Z", + "iopub.status.idle": "2023-07-26T12:46:45.661327Z", + "shell.execute_reply": "2023-07-26T12:46:45.661109Z" + } + }, "outputs": [], "source": [ "cum_haz = lambda t: t\n", @@ -51,7 +65,14 @@ "cell_type": "code", "execution_count": null, "id": "c9f9d590", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:45.662631Z", + "iopub.status.busy": "2023-07-26T12:46:45.662534Z", + "iopub.status.idle": "2023-07-26T12:46:45.672267Z", + "shell.execute_reply": "2023-07-26T12:46:45.672017Z" + } + }, "outputs": [], "source": [ "T = np.array([sim_time(np.log(2), cum_haz, rng) for _ in range(500)])" @@ -69,7 +90,14 @@ "cell_type": "code", "execution_count": null, "id": "2d8478dc", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:45.673768Z", + "iopub.status.busy": "2023-07-26T12:46:45.673685Z", + "iopub.status.idle": "2023-07-26T12:46:45.934676Z", + "shell.execute_reply": "2023-07-26T12:46:45.934321Z" + } + }, "outputs": [], "source": [ "kmf = KaplanMeierFitter(label=\"Simulated data\")\n", @@ -111,6 +139,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/helpers/svm.ipynb b/docs/source/helpers/svm.ipynb index 593d840..eb950b5 100644 --- a/docs/source/helpers/svm.ipynb +++ b/docs/source/helpers/svm.ipynb @@ -14,7 +14,14 @@ "cell_type": "code", "execution_count": null, "id": "2746a357", 
- "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:51.026740Z", + "iopub.status.busy": "2023-07-26T12:46:51.026289Z", + "iopub.status.idle": "2023-07-26T12:46:51.779743Z", + "shell.execute_reply": "2023-07-26T12:46:51.779280Z" + } + }, "outputs": [], "source": [ "import numpy as np\n", @@ -34,7 +41,14 @@ "cell_type": "code", "execution_count": null, "id": "4728535b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:51.781697Z", + "iopub.status.busy": "2023-07-26T12:46:51.781546Z", + "iopub.status.idle": "2023-07-26T12:46:51.783810Z", + "shell.execute_reply": "2023-07-26T12:46:51.783514Z" + } + }, "outputs": [], "source": [ "rng = np.random.default_rng(1)\n", @@ -56,7 +70,14 @@ "cell_type": "code", "execution_count": null, "id": "74da6860", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:51.785373Z", + "iopub.status.busy": "2023-07-26T12:46:51.785272Z", + "iopub.status.idle": "2023-07-26T12:46:51.789605Z", + "shell.execute_reply": "2023-07-26T12:46:51.789351Z" + } + }, "outputs": [], "source": [ "svm = SVC(kernel='linear')\n", @@ -67,7 +88,14 @@ "cell_type": "code", "execution_count": null, "id": "d87b6f75", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:51.790987Z", + "iopub.status.busy": "2023-07-26T12:46:51.790907Z", + "iopub.status.idle": "2023-07-26T12:46:51.883284Z", + "shell.execute_reply": "2023-07-26T12:46:51.882993Z" + } + }, "outputs": [], "source": [ "plot(X, Y, svm)" @@ -89,7 +117,14 @@ "cell_type": "code", "execution_count": null, "id": "bc58956a", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-26T12:46:51.884984Z", + "iopub.status.busy": "2023-07-26T12:46:51.884867Z", + "iopub.status.idle": "2023-07-26T12:46:52.011375Z", + "shell.execute_reply": "2023-07-26T12:46:52.011081Z" + } + }, "outputs": [], "source": [ "plot(X, Y, svm, 
features=(3, 4))" @@ -106,6 +141,18 @@ "display_name": "python3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/imdb.ipynb b/docs/source/imdb.ipynb index 1718a58..d9ba5cb 100644 --- a/docs/source/imdb.ipynb +++ b/docs/source/imdb.ipynb @@ -5,71 +5,109 @@ "id": "50f2b809", "metadata": {}, "source": [ - "# Creating a clean IMDB dataset\n", + "# Creating IMDB dataset from `keras` version\n", + "\n", + "This script details how the `IMDB` data in `ISLP` was constructed.\n", "\n", "Running this example requires `keras`. Use `pip install keras` to install if necessary." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d920bb2e", "metadata": {}, "outputs": [], "source": [ - "import pickle" + "import pickle\n", + "import numpy as np\n", + "from scipy.sparse import coo_matrix, save_npz\n", + "import torch\n", + "from keras.datasets import imdb\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e507f1fb", + "cell_type": "markdown", + "id": "eaf27f0c-0cb0-4ad5-8775-d138e3f20933", "metadata": {}, - "outputs": [], "source": [ - "import numpy as np\n", - "from scipy.sparse import coo_matrix, save_npz\n", - "import torch" + "We first load the data using `keras`, limiting focus to the 10000 most commmon words." 
] }, { "cell_type": "code", - "execution_count": null, - "id": "b94d3f35", + "execution_count": 2, + "id": "29f0e01e", "metadata": {}, "outputs": [], "source": [ - "from keras.datasets import imdb\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences" + "# the 3 is for three terms: \n", + "num_words = 10000+3\n", + "((S_train, L_train), \n", + " (S_test, L_test)) = imdb.load_data(num_words=num_words)" + ] + }, + { + "cell_type": "markdown", + "id": "9020ab27-cc62-4b86-85ba-80a94ff692de", + "metadata": {}, + "source": [ + "The object `S_train` is effectively a list in which each document has been encoded into a sequence of\n", + "values from 0 to 10002." ] }, { "cell_type": "code", - "execution_count": null, - "id": "29f0e01e", + "execution_count": 3, + "id": "e27564c4-320f-42b6-9f2e-2a2afdebefcf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# the 3 is for three terms: \n", - "num_words = 10000+3\n", - "((S_train, Y_train), \n", - " (S_test, Y_test)) = imdb.load_data(num_words=num_words)" + "S_train[0][:10]" + ] + }, + { + "cell_type": "markdown", + "id": "15f039fe-faed-4884-a725-1c51d6c8d4d4", + "metadata": {}, + "source": [ + "We'll use `np.float32` as that is the common precision used in `torch`." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "6cc3c3cb", "metadata": {}, "outputs": [], "source": [ - "Y_train = Y_train.astype(np.float32)\n", - "Y_test = Y_test.astype(np.float32)" + "L_train = L_train.astype(np.float32)\n", + "L_test = L_test.astype(np.float32)" + ] + }, + { + "cell_type": "markdown", + "id": "005679bc-4337-4757-831e-f9a6ea50f6aa", + "metadata": {}, + "source": [ + "We will use a one-hot encoding that captures whether or not a given word appears in a given review." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7b6d1098", "metadata": {}, "outputs": [], @@ -88,18 +126,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "afcdc8b2", "metadata": {}, "outputs": [], "source": [ - "X_train, L_train = one_hot(S_train, num_words), Y_train\n", + "X_train = one_hot(S_train, num_words)\n", "X_test = one_hot(S_test, num_words)" ] }, + { + "cell_type": "markdown", + "id": "a67e299d-8774-4758-8953-77afdce775ab", + "metadata": {}, + "source": [ + "## Store as sparse tensors\n", + "\n", + "We see later in the lab that the dense representation is faster. Nevertheless,\n", + "let's store the one-hot representation as sparse `torch` tensors \n", + "as well as sparse `scipy` matrices." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "b19366ea", "metadata": {}, "outputs": [], @@ -115,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "b45ae6d1", "metadata": {}, "outputs": [], @@ -126,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "a47d6eb6", "metadata": {}, "outputs": [], @@ -137,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "d1b37b37", "metadata": {}, "outputs": [], @@ -151,12 +201,12 @@ "id": "1119823a", "metadata": {}, "source": [ - "save the sparse matrices" + "### Save as sparse `scipy` matrices" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "6cb6bfdf", "metadata": {}, "outputs": [], @@ -167,12 +217,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "eac1c2ae", "metadata": {}, "outputs": [], "source": [ - "np.save('IMDB_Y_test.npy', Y_test)\n", + "np.save('IMDB_Y_test.npy', L_test)\n", "np.save('IMDB_Y_train.npy', L_train)" ] }, @@ -181,12 +231,14 @@ "id": "25c128e3", "metadata": {}, "source": [ - "save and 
pickle the word index" + "## Save and pickle the word index\n", + "\n", + "We'll also want to store a lookup table to convert representations such as `S_train[0]` into words" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "8458bf67", "metadata": {}, "outputs": [], @@ -199,9 +251,46 @@ "lookup[4] = \"\"" ] }, + { + "cell_type": "markdown", + "id": "5e62ebff-2575-4d35-b46c-51c6f7598efc", + "metadata": {}, + "source": [ + "Let's look at our first training document:" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, + "id": "2aaefdf8-0a49-4bdb-8b40-55665283c8a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\" this film was just brilliant casting location scenery story direction everyone's really suited part they played and you\"" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "' '.join([lookup[i] for i in S_train[0][:20]])" + ] + }, + { + "cell_type": "markdown", + "id": "0e985a73-bfd9-42bd-a523-3dc6e223d602", + "metadata": {}, + "source": [ + "We save this lookup table so it can be loaded later " + ] + }, + { + "cell_type": "code", + "execution_count": 15, "id": "d95252de", "metadata": {}, "outputs": [], @@ -214,12 +303,15 @@ "id": "b3d900b9", "metadata": {}, "source": [ - "create the padded representations" + "## Padded representations\n", + "\n", + "For some of the recurrent models, we'll need sequences of common lengths, padded if necessary.\n", + "Here, we pad up to a maximum length of 500, filling the remaining entries with 0." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "637b3c5e", "metadata": {}, "outputs": [], @@ -230,9 +322,17 @@ " S_test]]" ] }, + { + "cell_type": "markdown", + "id": "a6218300-b355-44cc-b7fb-4bff81211aa6", + "metadata": {}, + "source": [ + "Finally, we save these for later use in the deep learning lab." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "bac69f88", "metadata": {}, "outputs": [], @@ -245,13 +345,24 @@ "metadata": { "jupytext": { "cell_metadata_filter": "-all", - "formats": "source///ipynb,jupyterbook///md:myst,jupyterbook///ipynb", - "main_language": "python" + "formats": "md,ipynb" }, "kernelspec": { - "display_name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, diff --git a/docs/source/index.rst b/docs/source/index.rst index 2c80bdc..44b40fc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,8 +3,7 @@ Welcome to ISLP documentation! .. automodule:: ISLP -Check out the :doc:`installation` section for further information. - +See the :doc:`api/index` Contents -------- @@ -16,5 +15,6 @@ Contents transforms models helpers + labs imdb - api/index + diff --git a/docs/source/installation.myst b/docs/source/installation.myst new file mode 100644 index 0000000..5fba989 --- /dev/null +++ b/docs/source/installation.myst @@ -0,0 +1,119 @@ +--- +file_format: mystnb +kernelspec: + name: python3 + display_name: python3 +--- + + +# Install instructions + +We generally recommend creating a [conda](https://anaconda.org) environment to isolate any code +from other dependencies. The `ISLP` package does not have unusual dependencies, but this is still +good practice. + +## Mac OS X / Linux + +To create a Python conda environment in a Mac OS X or Linux environment run: + +```{code-cell} ipython3 +--- +tags: [skip-execution] +--- +conda create --name islp python +``` + +Current conda should have this at least 3.9. If not, replace `python` +with `python=3.10`, `python=3.11` or `python=3.12`. 
+To run python +code in this environment, you must activate it: + +```{code-cell} ipython3 +--- +tags: [skip-execution] +--- +conda activate islp +``` + +## Windows + +On windows, create a `Python` environment called `islp` in the Anaconda app. This can be done by selecting `Environments` on the left hand side of the app's screen. After creating the environment, open a terminal within that environment by clicking on the "Play" button. + +# Installing `ISLP` + +Having completed the steps above, we use `pip` to install the `ISLP` package: + +```{code-cell} ipython3 +--- +tags: [skip-execution] +--- +pip install ISLP +``` + +## Frozen environment + +```{attention} + +Python packages change frequently. The labs here are built +with {{ ISLP_lab_link }}. Visit the lab git repo for specific instructions +to install the frozen environment. +``` + +## Torch requirements + +The `ISLP` labs use `torch` and various related packages for the lab +on deep learning. Most of the requirements are included in the requirements for `ISLP` though the labs +also use `torchinfo` and `torchvision`. These will be installed by the `requirements.txt` above. + +```{attention} +Because +`torch` and related libraries change frequently, you will note that we +have pinned the versions at specific versions that were used to make +current verisons of the labs. +``` + +## Jupyter + +```{attention} +If using the Anaconda App, `jupyter` can be installed with a GUI. Use +the GUI install instead of the `pip` install below. +``` + +### Mac OS X + +```{attention} + +If you are using the Anaconda GUI, it is recommended that you install JupyterLab through the GUI +and skip the step below. Installing both through the GUI and `pip` may result in conflicts and +a broken JupyterLab. + +If you have installed JupyterLab in your environment via the GUI, the above call `pip install ISLP` may be made within +any running notebook within that environment. 
+``` + +If JupyterLab is not already installed, run the following after having activated your `islp` environment: + +```{code-cell} ipython3 +--- +tags: [skip-execution] +--- +pip install jupyterlab +``` + +### Windows + +Either use the same `pip` command above or install JupyterLab from the +`Home` tab. Ensure that the environment is your `islp` +environment. This information appears near the top left in the +Anaconda `Home` page. + +# Google Colab + +The notebooks for the labs can be run in [Google +Colab](https://colab.research.google.com) with a few caveats: + +- Labs that use files in the filesystem will require one to mount your + Google Drive. See Google's [help](https://colab.research.google.com/notebooks/io.ipynb). + +- The packages will have to be reinstalled each time a new runtime is started. +For most labs, inserting `pip install ISLP` at the top of the notebook will suffice, though Colab will ask you to restart after installation. diff --git a/docs/source/installation.rst b/docs/source/installation.rst deleted file mode 100644 index 981b1ae..0000000 --- a/docs/source/installation.rst +++ /dev/null @@ -1,18 +0,0 @@ -Usage -===== - -.. _installation: - -Installation ------------- - -To use ISLP, first install it using pip: - -.. code-block:: console - - (.venv) $ pip install ISLP - -Creating recipes ----------------- - -BLAH diff --git a/docs/source/labs.myst b/docs/source/labs.myst new file mode 100644 index 0000000..b33bd3d --- /dev/null +++ b/docs/source/labs.myst @@ -0,0 +1,58 @@ +--- +file_format: mystnb +kernelspec: + name: python3 + display_name: python3 +myst_number_code_blocks: python +--- + +# Labs + +{{ ISLP_binder_code }} + +The current version of the labs for `ISLP` are included here. + + +## Package versions + + +```{attention} + +Python packages change frequently. The labs here are built +with {{ ISLP_lab_link }}. Visit the lab git repo for specific instructions +to install the frozen environment. 
+ + +A zip file containig all the labs and data files can be downloaded +here {{ ISLP_zip_link }}. + +``` + +```{warning} +The version of the `ISLP` library used to build these labs +may differ slightly from the one documented here. +The labs are built with {{ ISLP_lab_version }}. + +The [Binder](http://mybinder.org) link above will run {{ ISLP_lab_link }} with +library version {{ ISLP_lab_version }}. + +``` + + +```{toctree} +maxdepth: 1 + +labs/Ch02-statlearn-lab +labs/Ch03-linreg-lab +labs/Ch04-classification-lab +labs/Ch05-resample-lab +labs/Ch06-varselect-lab +labs/Ch07-nonlin-lab +labs/Ch08-baggboost-lab +labs/Ch09-svm-lab +labs/Ch10-deeplearning-lab +labs/Ch11-surv-lab +labs/Ch12-unsup-lab +labs/Ch13-multiple-lab +``` + diff --git a/docs/source/logo.png b/docs/source/logo.png new file mode 100644 index 0000000..237c1cd Binary files /dev/null and b/docs/source/logo.png differ diff --git a/docs/source/models.rst b/docs/source/models.rst index b34581f..5f9e5c8 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -4,8 +4,8 @@ Tools for regression models .. toctree:: models/spec - models/derived - models/submodels models/selection + models/anova + diff --git a/docs/source/models/anova.ipynb b/docs/source/models/anova.ipynb new file mode 100644 index 0000000..41e8bcb --- /dev/null +++ b/docs/source/models/anova.ipynb @@ -0,0 +1,648 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ee33d364", + "metadata": {}, + "source": [ + "# ANOVA using `ModelSpec`\n", + "\n", + "\n", + "In this lab we illustrate how to run create specific ANOVA analyses\n", + "using `ModelSpec`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c70fbaa", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from statsmodels.api import OLS\n", + "from statsmodels.stats.anova import anova_lm\n", + "\n", + "from ISLP import load_data\n", + "from ISLP.models import (ModelSpec,\n", + " derived_feature,\n", + " summarize)" + ] + }, + { + "cell_type": "markdown", + "id": "333a49cf", + "metadata": {}, + "source": [ + "We will carry out two simple ANOVA analyses of the `Hitters` data.\n", + "We wish to predict a baseball player’s `Salary` on the\n", + "basis of various statistics associated with performance in the\n", + "previous year." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8a708215", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "59" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Hitters = load_data('Hitters')\n", + "np.isnan(Hitters['Salary']).sum()" + ] + }, + { + "cell_type": "markdown", + "id": "dad5e991", + "metadata": {}, + "source": [ + " \n", + " We see that `Salary` is missing for 59 players. The\n", + "`dropna()` method of data frames removes all of the rows that have missing\n", + "values in any variable (by default --- see `Hitters.dropna?`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ac7086a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n", + " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n", + " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Hitters = Hitters.dropna()\n", + "Hitters.columns" + ] + }, + { + "cell_type": "markdown", + "id": "1a0a3521-be74-40df-a404-3895d80a11dc", + "metadata": {}, + "source": [ + "## Grouping variables\n", + "\n", + "A look at the [description](https://islp.readthedocs.io/en/latest/datasets/Hitters.html) of the data shows\n", + "that there are both career and 1986 offensive stats, as well as some defensive stats.\n", + "\n", + "Let's group the offensive into recent and career offensive stats, as well as a group of defensive variables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a215e43b-7bc8-4bdd-91cf-40d717cd7978", + "metadata": {}, + "outputs": [], + "source": [ + "confounders = derived_feature(['Division', 'League', 'NewLeague'],\n", + " name='confounders')\n", + "offense_career = derived_feature(['CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks'],\n", + " name='offense_career')\n", + "offense_1986 = derived_feature(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks'],\n", + " name='offense_1986')\n", + "defense_1986 = derived_feature(['PutOuts', 'Assists', 'Errors'],\n", + " name='defense_1986')" + ] + }, + { + "cell_type": "markdown", + "id": "aa15fd0c-1e8a-431e-8425-c61da8439976", + "metadata": {}, + "source": [ + "We'll first do a sequential ANOVA where terms are added sequentially" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "40cd6c28", + "metadata": {}, + "outputs": [], + "source": [ + "design = ModelSpec([confounders, offense_career, defense_1986, offense_1986]).fit(Hitters)\n", + "Y = np.array(Hitters['Salary'])\n", + "X = design.transform(Hitters)" + ] + }, + { + "cell_type": "markdown", + "id": "074120b1", + "metadata": {}, + "source": [ + "Along with a score we need to specify the search strategy. This is done through the object\n", + "`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()`\n", + "runs forward stepwise until any further additions to the model do not result\n", + "in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`\n", + "runs a fixed number of steps of stepwise search." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e65f5607", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
coefstd errtP>|t|
intercept148.218773.5952.0140.045
Division[W]-116.040440.188-2.8870.004
League[N]63.750379.0060.8070.421
NewLeague[N]-24.398978.843-0.3090.757
CAtBat-0.18870.120-1.5720.117
CHits0.16360.6650.2460.806
CHmRun-0.15171.612-0.0940.925
CRuns1.47160.7471.9710.050
CRBI0.80210.6911.1610.247
CWalks-0.81240.327-2.4810.014
PutOuts0.28270.0773.6610.000
Assists0.37550.2201.7050.089
Errors-3.29404.377-0.7530.452
AtBat-1.95090.624-3.1250.002
Hits7.43952.3633.1480.002
HmRun4.34496.1900.7020.483
Runs-2.33122.971-0.7850.433
RBI-1.06702.595-0.4110.681
Walks6.21961.8253.4090.001
\n", + "
" + ], + "text/plain": [ + " coef std err t P>|t|\n", + "intercept 148.2187 73.595 2.014 0.045\n", + "Division[W] -116.0404 40.188 -2.887 0.004\n", + "League[N] 63.7503 79.006 0.807 0.421\n", + "NewLeague[N] -24.3989 78.843 -0.309 0.757\n", + "CAtBat -0.1887 0.120 -1.572 0.117\n", + "CHits 0.1636 0.665 0.246 0.806\n", + "CHmRun -0.1517 1.612 -0.094 0.925\n", + "CRuns 1.4716 0.747 1.971 0.050\n", + "CRBI 0.8021 0.691 1.161 0.247\n", + "CWalks -0.8124 0.327 -2.481 0.014\n", + "PutOuts 0.2827 0.077 3.661 0.000\n", + "Assists 0.3755 0.220 1.705 0.089\n", + "Errors -3.2940 4.377 -0.753 0.452\n", + "AtBat -1.9509 0.624 -3.125 0.002\n", + "Hits 7.4395 2.363 3.148 0.002\n", + "HmRun 4.3449 6.190 0.702 0.483\n", + "Runs -2.3312 2.971 -0.785 0.433\n", + "RBI -1.0670 2.595 -0.411 0.681\n", + "Walks 6.2196 1.825 3.409 0.001" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "M = OLS(Y, X).fit()\n", + "summarize(M)" + ] + }, + { + "cell_type": "markdown", + "id": "29d9b55f", + "metadata": {}, + "source": [ + "We'll first produce the sequential, or Type I ANOVA results. This builds up a model sequentially and compares\n", + "two successive models." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cfbe5b92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
df_residssrdf_diffss_diffFPr(>F)
intercept262.05.331911e+070.0NaNNaNNaN
confounders259.05.131263e+073.02.006478e+066.7411472.144265e-04
offense_career253.03.059130e+076.02.072134e+0734.8086561.470455e-30
defense_1986250.02.730614e+073.03.285156e+0611.0371117.880207e-07
offense_1986244.02.420857e+076.03.097572e+065.2034444.648586e-05
\n", + "
" + ], + "text/plain": [ + " df_resid ssr df_diff ss_diff F \\\n", + "intercept 262.0 5.331911e+07 0.0 NaN NaN \n", + "confounders 259.0 5.131263e+07 3.0 2.006478e+06 6.741147 \n", + "offense_career 253.0 3.059130e+07 6.0 2.072134e+07 34.808656 \n", + "defense_1986 250.0 2.730614e+07 3.0 3.285156e+06 11.037111 \n", + "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n", + "\n", + " Pr(>F) \n", + "intercept NaN \n", + "confounders 2.144265e-04 \n", + "offense_career 1.470455e-30 \n", + "defense_1986 7.880207e-07 \n", + "offense_1986 4.648586e-05 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = anova_lm(*[OLS(Y, D).fit() for D in design.build_sequence(Hitters, anova_type='sequential')])\n", + "df.index = design.names\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "7092f666", + "metadata": {}, + "source": [ + "We can similarly compute the Type II ANOVA results which drops each term and compares to the full model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e2d43844", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
df_residssrdf_diffss_diffFPr(>F)
intercept244.02.420857e+071.04.024254e+054.0560764.511037e-02
confounders244.02.420857e+073.09.661738e+053.2460462.261572e-02
offense_career244.02.420857e+076.01.051025e+0717.6555965.701196e-17
defense_1986244.02.420857e+073.01.467933e+064.9318032.415732e-03
offense_1986244.02.420857e+076.03.097572e+065.2034444.648586e-05
\n", + "
" + ], + "text/plain": [ + " df_resid ssr df_diff ss_diff F \\\n", + "intercept 244.0 2.420857e+07 1.0 4.024254e+05 4.056076 \n", + "confounders 244.0 2.420857e+07 3.0 9.661738e+05 3.246046 \n", + "offense_career 244.0 2.420857e+07 6.0 1.051025e+07 17.655596 \n", + "defense_1986 244.0 2.420857e+07 3.0 1.467933e+06 4.931803 \n", + "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n", + "\n", + " Pr(>F) \n", + "intercept 4.511037e-02 \n", + "confounders 2.261572e-02 \n", + "offense_career 5.701196e-17 \n", + "defense_1986 2.415732e-03 \n", + "offense_1986 4.648586e-05 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "D_full = design.transform(Hitters)\n", + "OLS_full = OLS(Y, D_full).fit()\n", + "dfs = []\n", + "for d in design.build_sequence(Hitters, anova_type='drop'):\n", + " dfs.append(anova_lm(OLS(Y,d).fit(), OLS_full).iloc[1:])\n", + "df = pd.concat(dfs)\n", + "df.index = design.names\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "362709ae-9558-4c4c-8f5e-f8388caf631d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" + }, + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/models/derived.ipynb b/docs/source/models/derived.ipynb deleted file mode 100644 index cc1b0ac..0000000 --- a/docs/source/models/derived.ipynb +++ /dev/null @@ -1,2125 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "38217f02", - "metadata": {}, - "source": [ - "# Building 
design matrices with `ModelSpec`\n", - "\n", - "Force rebuild" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3107d1f9", - "metadata": {}, - "outputs": [], - "source": [ - "x=4\n", - "import numpy as np, pandas as pd\n", - "%load_ext rpy2.ipython\n", - "\n", - "from ISLP import load_data\n", - "from ISLP.models import ModelSpec\n", - "\n", - "import statsmodels.api as sm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cdc46a4e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n", - " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n", - " dtype='object')" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats = load_data('Carseats')\n", - "%R -i Carseats\n", - "Carseats.columns" - ] - }, - { - "cell_type": "markdown", - "id": "e0a2a83a", - "metadata": {}, - "source": [ - "## Let's break up income into groups" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "68b40caf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: OIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L' < 'M' < 'H']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'])\n", - "Carseats['OIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "35558d88", - "metadata": {}, - "source": [ - "Let's also create an unordered version" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e5e81a95", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - 
"395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: UIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L', 'M', 'H']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'],\n", - " ordered=False)\n", - "Carseats['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "4bbf9e13", - "metadata": {}, - "source": [ - "## A simple model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1ad729b3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Price', 'Income'], dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Price', 'Income'])\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d05e9ec8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Y = Carseats['Sales']\n", - "M = sm.OLS(Y, X).fit()\n", - "M.params" - ] - }, - { - "cell_type": "markdown", - "id": "b4e9ee33", - "metadata": {}, - "source": [ - "## Basic procedure\n", - "\n", - "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n", - "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n", - "is to inspect a dataframe for column info. 
The column `ShelveLoc` is categorical:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "64ac65d3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Bad\n", - "1 Good\n", - "2 Medium\n", - "3 Medium\n", - "4 Bad\n", - " ... \n", - "395 Good\n", - "396 Medium\n", - "397 Medium\n", - "398 Bad\n", - "399 Good\n", - "Name: ShelveLoc, Length: 400, dtype: category\n", - "Categories (3, object): ['Bad', 'Good', 'Medium']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "620f0e01", - "metadata": {}, - "source": [ - "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n", - "`get_columns` and `fit_encoder`." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "77b898e0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "4580a6bf", - "metadata": {}, - "source": [ - "It recognized ordinal columns as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c2dab855", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['OIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "5e7963d6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([ 73, 48, 35, 100]), ('Income',))" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "income = design.column_info_['Income']\n", - "cols, names = income.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "markdown", - "id": "6b689966", - "metadata": {}, - "source": [ - "## Encoding a column\n", - "\n", - "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n", - "variables usually are encoded by several columns, typically one less than the number of categories.\n", - "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n", - "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n", - "its encoder the first time data is passed to it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ff3b96b6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([[0., 0.],\n", - " [1., 0.],\n", - " [0., 1.],\n", - " [0., 1.]]),\n", - " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shelve = design.column_info_['ShelveLoc']\n", - "cols, names = shelve.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "7e87da20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[2.],\n", - " [1.],\n", - " [1.],\n", - " [0.]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "oincome = design.column_info_['OIncome']\n", - "oincome.get_columns(Carseats)[0][:4]" - ] - }, - { - "cell_type": "markdown", - "id": "4f2030ac", - "metadata": {}, - "source": [ - "## The terms\n", - "\n", - "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n", - "the `terms` argument which should be a sequence. The elements of `terms` are often\n", - "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n", - "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "27fc4fb3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Price', 'Income']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "16316981", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "ef3f2bd0", - "metadata": {}, - "source": [ - "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n", - "`Variable` can also create columns through the `build_columns` method of `ModelSpec`" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "dd9c7fa6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price\n", - " 0 120\n", - " 1 83\n", - " 2 80\n", - " 3 97\n", - " 4 128\n", - " .. ...\n", - " 395 128\n", - " 396 120\n", - " 397 159\n", - " 398 95\n", - " 399 120\n", - " \n", - " [400 rows x 1 columns],\n", - " ['Price'])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "price = design.terms_[0]\n", - "design.build_columns(Carseats, price)" - ] - }, - { - "cell_type": "markdown", - "id": "5fc4cc45", - "metadata": {}, - "source": [ - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. 
The\n", - "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", - "the call to `ModelSpec.fit`." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "49d7fb46", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price Income UIncome[L] UIncome[M]\n", - " 0 120.0 73.0 0.0 1.0\n", - " 1 83.0 48.0 1.0 0.0\n", - " 2 80.0 35.0 1.0 0.0\n", - " 3 97.0 100.0 0.0 0.0\n", - " 4 128.0 64.0 0.0 1.0\n", - " .. ... ... ... ...\n", - " 395 128.0 108.0 0.0 0.0\n", - " 396 120.0 23.0 1.0 0.0\n", - " 397 159.0 26.0 1.0 0.0\n", - " 398 95.0 79.0 0.0 1.0\n", - " 399 120.0 37.0 1.0 0.0\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n", - "design.build_columns(Carseats, new_var)" - ] - }, - { - "cell_type": "markdown", - "id": "bdfc0fe9", - "metadata": {}, - "source": [ - "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", - "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "cf6f3f4c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( mynewvar[0] mynewvar[1]\n", - " 0 -3.608693 -4.853177\n", - " 1 15.081506 35.708630\n", - " 2 27.422871 40.774250\n", - " 3 -33.973209 13.470489\n", - " 4 6.567316 -11.290100\n", - " .. ... ...\n", - " 395 -36.846346 -18.415783\n", - " 396 45.741500 3.245602\n", - " 397 49.097533 -35.725355\n", - " 398 -13.577772 18.845139\n", - " 399 31.927566 0.978436\n", - " \n", - " [400 rows x 2 columns],\n", - " ['mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=2)\n", - "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n", - "design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "markdown", - "id": "1552d19a", - "metadata": {}, - "source": [ - "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "12d955dd", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( Price Price mynewvar[0] mynewvar[1]\n", - " 0 120.0 120.0 -3.608693 -4.853177\n", - " 1 83.0 83.0 15.081506 35.708630\n", - " 2 80.0 80.0 27.422871 40.774250\n", - " 3 97.0 97.0 -33.973209 13.470489\n", - " 4 128.0 128.0 6.567316 -11.290100\n", - " .. ... ... ... ...\n", - " 395 128.0 128.0 -36.846346 -18.415783\n", - " 396 120.0 120.0 45.741500 3.245602\n", - " 397 159.0 159.0 49.097533 -35.725355\n", - " 398 95.0 95.0 -13.577772 18.845139\n", - " 399 120.0 120.0 31.927566 0.978436\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n", - "design.build_columns(Carseats, fancy_var)" - ] - }, - { - "cell_type": "markdown", - "id": "f5ea292d", - "metadata": {}, - "source": [ - "We can of course run PCA again on these features (if we wanted)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "ae2af29b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( fancy_pca[0] fancy_pca[1]\n", - " 0 -6.951792 4.859283\n", - " 1 55.170148 -24.694875\n", - " 2 59.418556 -38.033572\n", - " 3 34.722389 28.922184\n", - " 4 -21.419184 -3.120673\n", - " .. ... 
...\n", - " 395 -18.257348 40.760122\n", - " 396 -10.546709 -45.021658\n", - " 397 -77.706359 -37.174379\n", - " 398 36.668694 7.730851\n", - " 399 -9.540535 -31.059122\n", - " \n", - " [400 rows x 2 columns],\n", - " ['fancy_pca[0]', 'fancy_pca[1]'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca2 = PCA(n_components=2)\n", - "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n", - "design.build_columns(Carseats, pca2_var)" - ] - }, - { - "cell_type": "markdown", - "id": "57305dbe", - "metadata": {}, - "source": [ - "## Building the design matrix\n", - "\n", - "With these notions in mind, the final design is essentially then" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "89656ec4", - "metadata": {}, - "outputs": [], - "source": [ - "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "f6cb8167", - "metadata": {}, - "source": [ - "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n", - "a dataframe the index is adjusted accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "547cb625", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.intercept" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ff5b41d5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptPriceIncome
01.012073
11.08348
21.08035
31.097100
\n", - "
" - ], - "text/plain": [ - " intercept Price Income\n", - "0 1.0 120 73\n", - "1 1.0 83 48\n", - "2 1.0 80 35\n", - "3 1.0 97 100" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.transform(Carseats)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "932759cf", - "metadata": {}, - "source": [ - "## Predicting\n", - "\n", - "Constructing the design matrix at any values is carried out by the `transform` method." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "e2190b00", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n", - "new_X = design.transform(new_data)\n", - "M.get_prediction(new_X).predicted_mean" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "6545c5da", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 \n", - "12.65258 12.25873 \n" - ] - } - ], - "source": [ - "%%R -i new_data,Carseats\n", - "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "cd088b51", - "metadata": {}, - "source": [ - "### Difference between using `pd.DataFrame` and `np.ndarray`\n", - "\n", - "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "\n", - "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "8f37ae20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1.0, 120, 73],\n", - " [1.0, 83, 48],\n", - " [1.0, 80, 35],\n", - " [1.0, 97, 100]], dtype=object)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n", - "design_np = ModelSpec([0,3]).fit(Carseats_np)\n", - "design_np.transform(Carseats_np)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "184aefc2", - "metadata": {}, - "source": [ - "The following will fail for hopefully obvious reasons" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "e4134980", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n" - ] - } - ], - "source": [ - "try:\n", - " new_D = np.zeros((2,2))\n", - " new_D[:,0] = [10,20]\n", - " new_D[:,1] = [40,50]\n", - " M.get_prediction(new_D).predicted_mean\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "53808f3b", - "metadata": {}, - "source": [ - "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", - "with a matrix having 3 columns (the first representing an intercept).\n", - "\n", - "We might be tempted to try as with the `pd.DataFrame` and produce\n", - "an `np.ndarray` with only the necessary variables." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "62059c57", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "index 3 is out of bounds for axis 1 with size 2\n" - ] - } - ], - "source": [ - "try:\n", - " new_X = np.zeros((2,2))\n", - " new_X[:,0] = [10,20]\n", - " new_X[:,1] = [40,50]\n", - " new_D = design_np.transform(new_X)\n", - " M.get_prediction(new_D).predicted_mean\n", - "except IndexError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "ded12f69", - "metadata": {}, - "source": [ - "This fails because `design_np` is looking for column `3` from its `terms`:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "fbb509d1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design_np.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "f01391e4", - "metadata": {}, - "source": [ - "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n", - "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "10df55ae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_X = np.zeros((2,4))\n", - "new_X[:,0] = [10,20]\n", - "new_X[:,3] = [40,50]\n", - "new_D = design_np.transform(new_X)\n", - "M.get_prediction(new_D).predicted_mean" - ] 
- }, - { - "cell_type": "markdown", - "id": "b43099fb", - "metadata": {}, - "source": [ - "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n", - "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case." - ] - }, - { - "cell_type": "markdown", - "id": "50bce64d", - "metadata": {}, - "source": [ - "## A model with some categorical variables\n", - "\n", - "Categorical variables become `Column` instances with encoders." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "2eb2ff16", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n", - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "6686dff8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "0e0eafd7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 11.876012\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[L] -1.042297\n", - "UIncome[M] -0.119123\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - 
] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "43cce209", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "99bf408e", - "metadata": {}, - "source": [ - "## Getting the encoding you want\n", - "\n", - "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n", - "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n", - "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n", - "where `new_encoder` would somehow drop the column we want dropped. \n", - "\n", - "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "11c19ebf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "4b48e5d2", - "metadata": {}, - "source": [ - "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n", - "we may want to encode `UIncome` differently within the same model. 
In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n", - "and we need a way to allow different encodings of the same column of `Carseats`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "81f641ba", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "79f7eb4d", - "metadata": {}, - "source": [ - " We can create a new \n", - "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "2afb3b5d", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import contrast\n", - "pref_encoding = contrast('UIncome', 'drop', 'L')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "c44692ab", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[M] UIncome[H]\n", - " 0 1.0 0.0\n", - " 1 0.0 0.0\n", - " 2 0.0 0.0\n", - " 3 0.0 1.0\n", - " 4 1.0 0.0\n", - " .. ... 
...\n", - " 395 0.0 1.0\n", - " 396 0.0 0.0\n", - " 397 0.0 0.0\n", - " 398 1.0 0.0\n", - " 399 0.0 0.0\n", - " \n", - " [400 rows x 2 columns],\n", - " ['UIncome[M]', 'UIncome[H]'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, pref_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "c0bfb2a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "d263056c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 10.833715\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[M] 0.923174\n", - "UIncome[H] 1.042297\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "edf0dc68", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "82071a54", - "metadata": {}, - "source": [ - "## Interactions\n", - 
"\n", - "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n", - "to `ModelSpec`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "cd18a4a4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 7.866634\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "UIncome[L] -2.734895\n", - "UIncome[M] -2.619745\n", - "dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "229fa32d", - "metadata": {}, - "source": [ - "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n", - "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "b8c52dbb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n", - " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n", - " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_[0]" - ] - }, - { - "cell_type": "markdown", - "id": "e7f93464", - "metadata": {}, - "source": [ - "Comparing this to the previous `R` model." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "4094c01f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "d448c9ca", - "metadata": {}, - "source": [ - "We note a few important things:\n", - "\n", - "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n", - "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n", - "\n", - "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. 
For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n", - "\n", - "A few notes:\n", - "\n", - "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n", - "\n", - "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "634e05c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[H] UIncome[L] UIncome[M]\n", - " 0 0.0 0.0 1.0\n", - " 1 0.0 1.0 0.0\n", - " 2 0.0 1.0 0.0\n", - " 3 1.0 0.0 0.0\n", - " 4 0.0 0.0 1.0\n", - " .. ... ... 
...\n", - " 395 1.0 0.0 0.0\n", - " 396 0.0 1.0 0.0\n", - " 397 0.0 1.0 0.0\n", - " 398 0.0 0.0 1.0\n", - " 399 0.0 1.0 0.0\n", - " \n", - " [400 rows x 3 columns],\n", - " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_encoding = contrast('UIncome', None)\n", - "design.build_columns(Carseats, full_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "4c09c93f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.131739\n", - "UIncome[M] 0.115150\n", - "UIncome[H] 1.156118\n", - "UIncome[H]:ShelveLoc[Good] 3.738052\n", - "UIncome[H]:ShelveLoc[Medium] 1.514104\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "dtype: float64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "48c1989f", - "metadata": {}, - "source": [ - "## Special encodings\n", - "\n", - "For flexible models, we may want to consider transformations of features, i.e. polynomial\n", - "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n", - "we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms`\n", - "package includes a `Poly` transform" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "85a28d87", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('Income',), name='poly(Income, 3)', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import poly\n", - "poly('Income', 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "e17c8a9d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.440077\n", - "poly(Income, 3)[0] 10.036373\n", - "poly(Income, 3)[1] -2.799156\n", - "poly(Income, 3)[2] 2.399601\n", - "ShelveLoc[Good] 4.808133\n", - "ShelveLoc[Medium] 1.889533\n", - "dtype: float64" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "944f56d6", - "metadata": {}, - "source": [ - "Compare:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "1889caca", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n", - " 5.440077 10.036373 -2.799156 2.399601 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.808133 1.889533 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "bd4dca31", - "metadata": {}, - "source": [ - "## Splines\n", - "\n", - "Support for natural and B-splines is also included" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "70fae990", - "metadata": {}, - 
"outputs": [], - "source": [ - "from ISLP.models.model_spec import ns, bs, pca" - ] - }, - { - "cell_type": "markdown", - "id": "2d812694", - "metadata": {}, - "source": [ - "## Custom encoding\n", - "\n", - "Instead of PCA we might run some clustering on some features and then uses the clusters to\n", - "create new features. This can be done with `derived_variable`. Indeed, `pca`, `ns` and `bs` are all examples\n", - "of this." - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "8e5d2305", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import derived_variable, Contrast" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "8a40c663", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 1, 2, 1, 2, 1, 0, 1, 0, 0, 0, 1, 2, 2, 0, 1, 2, 1, 0, 0, 0, 2,\n", - " 2, 2, 1, 2, 1, 0, 0, 1, 0, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2,\n", - " 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 0, 1, 2,\n", - " 0, 1, 1, 2, 1, 1, 2, 0, 0, 1, 1, 0, 2, 0, 1, 0, 0, 2, 2, 0, 1, 2,\n", - " 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 2, 0, 0, 1, 2, 0, 1, 0,\n", - " 0, 1, 0, 2, 0, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0,\n", - " 0, 0, 2, 1, 0, 2, 1, 1, 1, 2, 0, 0, 2, 0, 2, 1, 0, 0, 0, 1, 2, 2,\n", - " 1, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 1, 0, 0, 1, 1, 1, 0, 0, 2, 0,\n", - " 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 0, 2, 2, 1, 1, 2, 2, 0, 1, 1, 2, 2,\n", - " 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 2, 1, 1, 0, 0, 1, 2, 2, 1, 1,\n", - " 1, 2, 0, 2, 2, 2, 2, 0, 1, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 0, 0, 0,\n", - " 2, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 1, 2, 1, 2,\n", - " 2, 2, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 2,\n", - " 1, 2, 2, 1, 1, 0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 0,\n", - " 1, 0, 1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 0, 2, 1, 0, 1,\n", - " 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 1,\n", - " 1, 1, 2, 0, 0, 0, 0, 1, 0, 
2, 0, 2, 1, 2, 1, 0, 2, 1, 1, 0, 2, 2,\n", - " 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1, 1,\n", - " 2, 2, 0, 2], dtype=int32)" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.cluster import KMeans\n", - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "cluster = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n", - "group = Variable(('Income', 'Price', 'Advertising', 'Population'), 'group', None)\n", - "X = design.build_submodel(Carseats, [group]).drop('intercept', axis=1)\n", - "cluster.fit(X.values)\n", - "cluster.predict(X.values)" - ] - }, - { - "cell_type": "markdown", - "id": "9bc38836", - "metadata": {}, - "source": [ - "For clustering, we often want to use the `predict` method rather than the `transform` method. If the ultimate\n", - "features all use `transform` then the do not even need to use these two calls to `make_pipeline`." - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "8ceab9b6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptmyclus
01.01
11.01
21.02
31.01
41.02
.........
3951.01
3961.02
3971.02
3981.00
3991.02
\n", - "

400 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " intercept myclus\n", - "0 1.0 1\n", - "1 1.0 1\n", - "2 1.0 2\n", - "3 1.0 1\n", - "4 1.0 2\n", - ".. ... ...\n", - "395 1.0 1\n", - "396 1.0 2\n", - "397 1.0 2\n", - "398 1.0 0\n", - "399 1.0 2\n", - "\n", - "[400 rows x 2 columns]" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n", - "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n", - " name='myclus', \n", - " encoder=cluster2,\n", - " use_transform=False)\n", - "design = ModelSpec([cluster_var]).fit(Carseats)\n", - "design.transform(Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "1f9b2630", - "metadata": {}, - "source": [ - "Somewhat clunkily, we can make this a categorical variable by creating a `Variable` with a\n", - "categorical encoder." - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "ffde00a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=(Variable(variables=('Income', 'Price', 'Advertising', 'Population'), name='myclus', encoder=Pipeline(steps=[('standardscaler', StandardScaler()),\n", - " ('kmeans', KMeans(n_clusters=3, random_state=0))]), use_transform=False, pure_columns=False, override_encoder_colnames=True),), name='mynewcat', encoder=Contrast(), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n", - "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n", - " name='myclus', \n", - " encoder=cluster2,\n", - " use_transform=False)\n", - "cat_cluster = Variable((cluster_var,), name='mynewcat', encoder=Contrast(method='drop'))\n", - "cat_cluster" - ] - }, - { - "cell_type": 
"code", - "execution_count": 54, - "id": "5afeab7c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
intercept12
01.01.00.0
11.01.00.0
21.00.01.0
31.01.00.0
41.00.01.0
............
3951.01.00.0
3961.00.01.0
3971.00.01.0
3981.00.00.0
3991.00.01.0
\n", - "

400 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " intercept 1 2\n", - "0 1.0 1.0 0.0\n", - "1 1.0 1.0 0.0\n", - "2 1.0 0.0 1.0\n", - "3 1.0 1.0 0.0\n", - "4 1.0 0.0 1.0\n", - ".. ... ... ...\n", - "395 1.0 1.0 0.0\n", - "396 1.0 0.0 1.0\n", - "397 1.0 0.0 1.0\n", - "398 1.0 0.0 0.0\n", - "399 1.0 0.0 1.0\n", - "\n", - "[400 rows x 3 columns]" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([cat_cluster]).fit(Carseats)\n", - "\n", - "design.transform(Carseats)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e24d5637-80fb-49bf-ac10-8ff68cb8bd8f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" - }, - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/models/selection.ipynb b/docs/source/models/selection.ipynb index 3a7d002..fd66d95 100644 --- a/docs/source/models/selection.ipynb +++ b/docs/source/models/selection.ipynb @@ -2,2723 +2,259 @@ "cells": [ { "cell_type": "markdown", - "id": "72bae06a", + "id": "247387ec-1477-42e6-9e69-cad1cacb5721", "metadata": {}, "source": [ - "# Model selection using `ModelSpec`" + "# Model selection using `ModelSpec`\n", + "\n", + "\n", + "In this lab we illustrate how to run forward stepwise model selection\n", + "using the model specification capability of `ModelSpec`." 
] }, { "cell_type": "code", "execution_count": 1, - "id": "ae6bd850", + "id": "4720bb2a-6bec-4e91-a57e-9689aa4f0532", "metadata": {}, "outputs": [], "source": [ - "import numpy as np, pandas as pd\n", - "%load_ext rpy2.ipython\n", - "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from statsmodels.api import OLS\n", "from ISLP import load_data\n", - "from ISLP.models import ModelSpec\n", - "\n", - "import statsmodels.api as sm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5ac10e72", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n", - " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n", - " dtype='object')" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats = load_data('Carseats')\n", - "%R -i Carseats\n", - "Carseats.columns" - ] - }, - { - "cell_type": "markdown", - "id": "80a586d9", - "metadata": {}, - "source": [ - "## Let's break up income into groups" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "850356ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: OIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L' < 'M' < 'H']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'])\n", - "Carseats['OIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "e24def3a", - "metadata": {}, - "source": [ - "Let's also create an unordered version" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "edf83080", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", 
- "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: UIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L', 'M', 'H']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'],\n", - " ordered=False)\n", - "Carseats['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "aa22bb9c", - "metadata": {}, - "source": [ - "## A simple model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "38d92522", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Price', 'Income'], dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Price', 'Income'])\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "cfc2056f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Y = Carseats['Sales']\n", - "M = sm.OLS(Y, X).fit()\n", - "M.params" - ] - }, - { - "cell_type": "markdown", - "id": "4674c345", - "metadata": {}, - "source": [ - "## Basic procedure\n", - "\n", - "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n", - "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n", - "is to inspect a dataframe for column info. 
The column `ShelveLoc` is categorical:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "5688f0ad", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Bad\n", - "1 Good\n", - "2 Medium\n", - "3 Medium\n", - "4 Bad\n", - " ... \n", - "395 Good\n", - "396 Medium\n", - "397 Medium\n", - "398 Bad\n", - "399 Good\n", - "Name: ShelveLoc, Length: 400, dtype: category\n", - "Categories (3, object): ['Bad', 'Good', 'Medium']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['ShelveLoc']" + "from ISLP.models import (ModelSpec,\n", + " Stepwise,\n", + " sklearn_selected)" ] }, { "cell_type": "markdown", - "id": "4ae28ffa", + "id": "1c224240-ce8b-47f3-a85a-052c43038b26", "metadata": {}, "source": [ - "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n", - "`get_columns` and `fit_encoder`." + "### Forward Selection\n", + " \n", + "We will apply the forward-selection approach to the `Hitters` \n", + "data. We wish to predict a baseball player’s `Salary` on the\n", + "basis of various statistics associated with performance in the\n", + "previous year." ] }, { "cell_type": "code", - "execution_count": 8, - "id": "5f8926fd", + "execution_count": 2, + "id": "2adc66cc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())" + "59" ] }, - "execution_count": 8, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "design.column_info_['ShelveLoc']" + "Hitters = load_data('Hitters')\n", + "np.isnan(Hitters['Salary']).sum()" ] }, { "cell_type": "markdown", - "id": "966f53a5", - "metadata": {}, - "source": [ - "It recognized ordinal columns as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "a137fa1e", + "id": "40c9a484", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "design.column_info_['OIncome']" + " \n", + " We see that `Salary` is missing for 59 players. The\n", + "`dropna()` method of data frames removes all of the rows that have missing\n", + "values in any variable (by default --- see `Hitters.dropna?`)." ] }, { "cell_type": "code", - "execution_count": 10, - "id": "3390dcb0", + "execution_count": 3, + "id": "1869fdab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([ 73, 48, 35, 100]), ('Income',))" + "(263, 20)" ] }, - "execution_count": 10, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "income = design.column_info_['Income']\n", - "cols, names = income.get_columns(Carseats)\n", - "(cols[:4], names)" + "Hitters = Hitters.dropna()\n", + "Hitters.shape" ] }, { "cell_type": "markdown", - "id": "b6667415", - "metadata": {}, - "source": [ - "## Encoding a column\n", - "\n", - "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n", - "variables usually are encoded by several columns, typically one less than the number of categories.\n", - "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n", - "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n", - "its encoder the first time data is passed to it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a1b42dbd", + "id": "0a1fe9e6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([[0., 0.],\n", - " [1., 0.],\n", - " [0., 1.],\n", - " [0., 1.]]),\n", - " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "shelve = design.column_info_['ShelveLoc']\n", - "cols, names = shelve.get_columns(Carseats)\n", - "(cols[:4], names)" + "We first choose the best model using forward selection based on AIC. This score\n", + "is not built in as a metric to `sklearn`. We therefore define a function to compute it ourselves, and use\n", + "it as a scorer. By default, `sklearn` tries to maximize a score, hence\n", + " our scoring function computes the negative AIC statistic." ] }, { "cell_type": "code", - "execution_count": 12, - "id": "31367988", + "execution_count": 4, + "id": "76bd8110", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[2.],\n", - " [1.],\n", - " [1.],\n", - " [0.]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "oincome = design.column_info_['OIncome']\n", - "oincome.get_columns(Carseats)[0][:4]" + "def negAIC(estimator, X, Y):\n", + " \"Negative AIC\"\n", + " n, p = X.shape\n", + " Yhat = estimator.predict(X)\n", + " MSE = np.mean((Y - Yhat)**2)\n", + " return n + n * np.log(MSE) + 2 * (p + 1)\n", + " " ] }, { "cell_type": "markdown", - "id": "751c1487", - "metadata": {}, - "source": [ - "## The terms\n", - "\n", - "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n", - "the `terms` argument which should be a sequence. 
The elements of `terms` are often\n", - "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n", - "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "6e2b6155", + "id": "14ba6f49", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Price', 'Income']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "design.terms" + "We need to estimate the residual variance $\\sigma^2$, which is the first argument in our scoring function above.\n", + "We will fit the biggest model, using all the variables, and estimate $\\sigma^2$ based on its MSE." ] }, { "cell_type": "code", - "execution_count": 14, - "id": "d3e669da", + "execution_count": 5, + "id": "94e10f35", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "design.terms_" + "design = ModelSpec(Hitters.columns.drop('Salary')).fit(Hitters)\n", + "Y = np.array(Hitters['Salary'])\n", + "X = design.transform(Hitters)" ] }, { "cell_type": "markdown", - "id": "fb0a45c9", + "id": "afdda5f2", "metadata": {}, "source": [ - "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n", - "`Variable` can also create columns through the `build_columns` method of `ModelSpec`" + "Along with a score we need to specify the search strategy. This is done through the object\n", + "`Stepwise()` in the `ISLP.models` package. 
The method `Stepwise.first_peak()`\n", + "runs forward stepwise until any further additions to the model do not result\n", + "in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`\n", + "runs a fixed number of steps of stepwise search." ] }, { "cell_type": "code", - "execution_count": 15, - "id": "554c67cb", + "execution_count": 6, + "id": "048c8500", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price\n", - " 0 120\n", - " 1 83\n", - " 2 80\n", - " 3 97\n", - " 4 128\n", - " .. ...\n", - " 395 128\n", - " 396 120\n", - " 397 159\n", - " 398 95\n", - " 399 120\n", - " \n", - " [400 rows x 1 columns],\n", - " ['Price'])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "price = design.terms_[0]\n", - "design.build_columns(Carseats, price)" + "strategy = Stepwise.first_peak(design,\n", + " direction='forward',\n", + " max_terms=len(design.terms))" ] }, { "cell_type": "markdown", - "id": "06956a6f", + "id": "e0c0af0e", "metadata": {}, "source": [ - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n", - "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", - "the call to `ModelSpec.fit`." + " \n", + "We now fit a linear regression model with `Salary` as outcome using forward\n", + "selection. To do so, we use the function `sklearn_selected()` from the `ISLP.models` package. This takes\n", + "a model from `statsmodels` along with a search strategy and selects a model with its\n", + "`fit` method. Without specifying a `scoring` argument, the score defaults to MSE, and so all 19 variables will be\n", + "selected." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "dd434884", + "execution_count": 7, + "id": "26f09fe9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "( Price Income UIncome[L] UIncome[M]\n", - " 0 120.0 73.0 0.0 1.0\n", - " 1 83.0 48.0 1.0 0.0\n", - " 2 80.0 35.0 1.0 0.0\n", - " 3 97.0 100.0 0.0 0.0\n", - " 4 128.0 64.0 0.0 1.0\n", - " .. ... ... ... ...\n", - " 395 128.0 108.0 0.0 0.0\n", - " 396 120.0 23.0 1.0 0.0\n", - " 397 159.0 26.0 1.0 0.0\n", - " 398 95.0 79.0 0.0 1.0\n", - " 399 120.0 37.0 1.0 0.0\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])" + "('Assists',\n", + " 'AtBat',\n", + " 'CAtBat',\n", + " 'CHits',\n", + " 'CHmRun',\n", + " 'CRBI',\n", + " 'CRuns',\n", + " 'CWalks',\n", + " 'Division',\n", + " 'Errors',\n", + " 'Hits',\n", + " 'HmRun',\n", + " 'League',\n", + " 'NewLeague',\n", + " 'PutOuts',\n", + " 'RBI',\n", + " 'Runs',\n", + " 'Walks',\n", + " 'Years')" ] }, - "execution_count": 16, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n", - "design.build_columns(Carseats, new_var)" + "hitters_MSE = sklearn_selected(OLS,\n", + " strategy)\n", + "hitters_MSE.fit(Hitters, Y)\n", + "hitters_MSE.selected_state_" ] }, { "cell_type": "markdown", - "id": "5cdb088c", + "id": "4acf4792", "metadata": {}, "source": [ - "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", - "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." + " Using `neg_Cp` results in a smaller model, as expected, with just 4variables selected." 
] }, { "cell_type": "code", - "execution_count": 17, - "id": "519a642e", + "execution_count": 8, + "id": "a825f4d8", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/plain": [ - "( mynewvar[0] mynewvar[1]\n", - " 0 -3.608693 -4.853177\n", - " 1 15.081506 35.708630\n", - " 2 27.422871 40.774250\n", - " 3 -33.973209 13.470489\n", - " 4 6.567316 -11.290100\n", - " .. ... ...\n", - " 395 -36.846346 -18.415783\n", - " 396 45.741500 3.245602\n", - " 397 49.097533 -35.725355\n", - " 398 -13.577772 18.845139\n", - " 399 31.927566 0.978436\n", - " \n", - " [400 rows x 2 columns],\n", - " ['mynewvar[0]', 'mynewvar[1]'])" + "('Assists', 'Errors', 'League', 'NewLeague')" ] }, - "execution_count": 17, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=2)\n", - "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n", - "design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "markdown", - "id": "403921a2", - "metadata": {}, - "source": [ - "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." 
+ "hitters_Cp = sklearn_selected(OLS,\n", + " strategy,\n", + " scoring=negAIC)\n", + "hitters_Cp.fit(Hitters, Y)\n", + "hitters_Cp.selected_state_" ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "b422cde1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( Price Price mynewvar[0] mynewvar[1]\n", - " 0 120.0 120.0 -3.608693 -4.853177\n", - " 1 83.0 83.0 15.081506 35.708630\n", - " 2 80.0 80.0 27.422871 40.774250\n", - " 3 97.0 97.0 -33.973209 13.470489\n", - " 4 128.0 128.0 6.567316 -11.290100\n", - " .. ... ... ... ...\n", - " 395 128.0 128.0 -36.846346 -18.415783\n", - " 396 120.0 120.0 45.741500 3.245602\n", - " 397 159.0 159.0 49.097533 -35.725355\n", - " 398 95.0 95.0 -13.577772 18.845139\n", - " 399 120.0 120.0 31.927566 0.978436\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n", - "design.build_columns(Carseats, fancy_var)" - ] - }, - { - "cell_type": "markdown", - "id": "53e38f57", - "metadata": {}, - "source": [ - "We can of course run PCA again on these features (if we wanted)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "6347acb6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( fancy_pca[0] fancy_pca[1]\n", - " 0 -6.951792 4.859283\n", - " 1 55.170148 -24.694875\n", - " 2 59.418556 -38.033572\n", - " 3 34.722389 28.922184\n", - " 4 -21.419184 -3.120673\n", - " .. ... 
...\n", - " 395 -18.257348 40.760122\n", - " 396 -10.546709 -45.021658\n", - " 397 -77.706359 -37.174379\n", - " 398 36.668694 7.730851\n", - " 399 -9.540535 -31.059122\n", - " \n", - " [400 rows x 2 columns],\n", - " ['fancy_pca[0]', 'fancy_pca[1]'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca2 = PCA(n_components=2)\n", - "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n", - "design.build_columns(Carseats, pca2_var)" - ] - }, - { - "cell_type": "markdown", - "id": "08b5ddb0", - "metadata": {}, - "source": [ - "## Building the design matrix\n", - "\n", - "With these notions in mind, the final design is essentially then" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "a8eb3e33", - "metadata": {}, - "outputs": [], - "source": [ - "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "97912337", - "metadata": {}, - "source": [ - "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n", - "a dataframe the index is adjusted accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "72b5e629", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.intercept" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "8a457e3e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptPriceIncome
01.012073
11.08348
21.08035
31.097100
\n", - "
" - ], - "text/plain": [ - " intercept Price Income\n", - "0 1.0 120 73\n", - "1 1.0 83 48\n", - "2 1.0 80 35\n", - "3 1.0 97 100" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.transform(Carseats)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "8624ab8c", - "metadata": {}, - "source": [ - "## Predicting\n", - "\n", - "Constructing the design matrix at any values is carried out by the `transform` method." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "6052765e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n", - "new_X = design.transform(new_data)\n", - "M.get_prediction(new_X).predicted_mean" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "9158de59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 \n", - "12.65258 12.25873 \n" - ] - } - ], - "source": [ - "%%R -i new_data,Carseats\n", - "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "9608bed3", - "metadata": {}, - "source": [ - "### Difference between using `pd.DataFrame` and `np.ndarray`\n", - "\n", - "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "\n", - "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "f0b8120f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1.0, 120, 73],\n", - " [1.0, 83, 48],\n", - " [1.0, 80, 35],\n", - " [1.0, 97, 100]], dtype=object)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n", - "design_np = ModelSpec([0,3]).fit(Carseats_np)\n", - "design_np.transform(Carseats_np)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "270a02a6", - "metadata": {}, - "source": [ - "The following will fail for hopefully obvious reasons" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "4ffbce7e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n" - ] - } - ], - "source": [ - "try:\n", - " new_D = np.zeros((2,2))\n", - " new_D[:,0] = [10,20]\n", - " new_D[:,1] = [40,50]\n", - " M.get_prediction(new_D).predicted_mean\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "bc5ff62b", - "metadata": {}, - "source": [ - "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", - "with a matrix having 3 columns (the first representing an intercept).\n", - "\n", - "We might be tempted to try as with the `pd.DataFrame` and produce\n", - "an `np.ndarray` with only the necessary variables." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "34dae1e9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "index 3 is out of bounds for axis 1 with size 2\n" - ] - } - ], - "source": [ - "try:\n", - " new_X = np.zeros((2,2))\n", - " new_X[:,0] = [10,20]\n", - " new_X[:,1] = [40,50]\n", - " new_D = design_np.transform(new_X)\n", - " M.get_prediction(new_D).predicted_mean\n", - "except IndexError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "7e9da262", - "metadata": {}, - "source": [ - "This fails because `design_np` is looking for column `3` from its `terms`:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "938b9430", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design_np.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "083e9529", - "metadata": {}, - "source": [ - "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n", - "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d413a9fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_X = np.zeros((2,4))\n", - "new_X[:,0] = [10,20]\n", - "new_X[:,3] = [40,50]\n", - "new_D = design_np.transform(new_X)\n", - "M.get_prediction(new_D).predicted_mean" - ] 
- }, - { - "cell_type": "markdown", - "id": "0f4b508b", - "metadata": {}, - "source": [ - "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n", - "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case." - ] - }, - { - "cell_type": "markdown", - "id": "8bcbd973", - "metadata": {}, - "source": [ - "## A model with some categorical variables\n", - "\n", - "Categorical variables become `Column` instances with encoders." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "cf13f72e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n", - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "c1fa0a90", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "b28aa313", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 11.876012\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[L] -1.042297\n", - "UIncome[M] -0.119123\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - 
] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "aa764acc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "31876a29", - "metadata": {}, - "source": [ - "## Getting the encoding you want\n", - "\n", - "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n", - "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n", - "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n", - "where `new_encoder` would somehow drop the column we want dropped. \n", - "\n", - "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "bac2643c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "1485735d", - "metadata": {}, - "source": [ - "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n", - "we may want to encode `UIncome` differently within the same model. 
In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n", - "and we need a way to allow different encodings of the same column of `Carseats`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "3987c5d6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "7a6631c9", - "metadata": {}, - "source": [ - " We can create a new \n", - "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "83a9b94e", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import contrast\n", - "pref_encoding = contrast('UIncome', 'drop', 'L')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "f0ffabea", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[M] UIncome[H]\n", - " 0 1.0 0.0\n", - " 1 0.0 0.0\n", - " 2 0.0 0.0\n", - " 3 0.0 1.0\n", - " 4 1.0 0.0\n", - " .. ... 
...\n", - " 395 0.0 1.0\n", - " 396 0.0 0.0\n", - " 397 0.0 0.0\n", - " 398 1.0 0.0\n", - " 399 0.0 0.0\n", - " \n", - " [400 rows x 2 columns],\n", - " ['UIncome[M]', 'UIncome[H]'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, pref_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "4a5fdc64", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "ae7e3bd2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 10.833715\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[M] 0.923174\n", - "UIncome[H] 1.042297\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "c12ac3df", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "53bf8aef", - "metadata": {}, - "source": [ - "## Interactions\n", - 
"\n", - "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n", - "to `ModelSpec`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "47723bce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 7.866634\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "UIncome[L] -2.734895\n", - "UIncome[M] -2.619745\n", - "dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "86060622", - "metadata": {}, - "source": [ - "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n", - "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "d7a2ab9b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n", - " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n", - " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_[0]" - ] - }, - { - "cell_type": "markdown", - "id": "2a5e7f6b", - "metadata": {}, - "source": [ - "Comparing this to the previous `R` model." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "bbb02036", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "89106a85", - "metadata": {}, - "source": [ - "We note a few important things:\n", - "\n", - "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n", - "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n", - "\n", - "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. 
For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n", - "\n", - "A few notes:\n", - "\n", - "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n", - "\n", - "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "151f3fee", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[H] UIncome[L] UIncome[M]\n", - " 0 0.0 0.0 1.0\n", - " 1 0.0 1.0 0.0\n", - " 2 0.0 1.0 0.0\n", - " 3 1.0 0.0 0.0\n", - " 4 0.0 0.0 1.0\n", - " .. ... ... 
...\n", - " 395 1.0 0.0 0.0\n", - " 396 0.0 1.0 0.0\n", - " 397 0.0 1.0 0.0\n", - " 398 0.0 0.0 1.0\n", - " 399 0.0 1.0 0.0\n", - " \n", - " [400 rows x 3 columns],\n", - " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_encoding = contrast('UIncome', None)\n", - "design.build_columns(Carseats, full_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "945ce7bc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.131739\n", - "UIncome[M] 0.115150\n", - "UIncome[H] 1.156118\n", - "UIncome[H]:ShelveLoc[Good] 3.738052\n", - "UIncome[H]:ShelveLoc[Medium] 1.514104\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "dtype: float64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "450b94dd", - "metadata": {}, - "source": [ - "## Special encodings\n", - "\n", - "For flexible models, we may want to consider transformations of features, i.e. polynomial\n", - "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n", - "we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms`\n", - "package includes a `Poly` transform" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "18d5c1c8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import poly\n", - "poly('Income', 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "46c7d911", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.440077\n", - "poly(Income, 3, )[0] 10.036373\n", - "poly(Income, 3, )[1] -2.799156\n", - "poly(Income, 3, )[2] 2.399601\n", - "ShelveLoc[Good] 4.808133\n", - "ShelveLoc[Medium] 1.889533\n", - "dtype: float64" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "99bf13a1", - "metadata": {}, - "source": [ - "Compare:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "7606facd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n", - " 5.440077 10.036373 -2.799156 2.399601 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.808133 1.889533 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "a4931031", - "metadata": {}, - "source": [ - "## Splines\n", - "\n", - "Support for natural and B-splines is also included" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "1c1bf5f3", - "metadata": 
{}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 4.240421\n", - "ns(Income, , df=5)[0] 1.468196\n", - "ns(Income, , df=5)[1] 1.499471\n", - "ns(Income, , df=5)[2] 1.152070\n", - "ns(Income, , df=5)[3] 2.418398\n", - "ns(Income, , df=5)[4] 1.804460\n", - "ShelveLoc[Good] 4.810449\n", - "ShelveLoc[Medium] 1.881095\n", - "dtype: float64" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import ns, bs, pca\n", - "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "8c24254b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n", - " 4.240421 1.468196 1.499471 1.152070 \n", - "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n", - " 2.418398 1.804460 4.810449 1.881095 \n" - ] - } - ], - "source": [ - "%%R\n", - "library(splines)\n", - "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "f9d6c4a7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 3.495085\n", - "bs(Income, , df=7, degree=2)[0] 1.813118\n", - "bs(Income, , df=7, degree=2)[1] 0.961852\n", - "bs(Income, , df=7, degree=2)[2] 2.471545\n", - "bs(Income, , df=7, degree=2)[3] 2.158891\n", - "bs(Income, , df=7, degree=2)[4] 2.091625\n", - "bs(Income, , df=7, degree=2)[5] 2.600669\n", - "bs(Income, , df=7, degree=2)[6] 2.843108\n", - "ShelveLoc[Good] 4.804919\n", - "ShelveLoc[Medium] 1.880337\n", - "dtype: float64" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n", - "X = 
design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "0bf1726a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) bs(Income, df = 7, degree = 2)1 \n", - " 3.4950851 1.8131176 \n", - "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n", - " 0.9618523 2.4715450 \n", - "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n", - " 2.1588908 2.0916252 \n", - "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n", - " 2.6006694 2.8431084 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.8049190 1.8803375 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "914df4cf", - "metadata": {}, - "source": [ - "## PCA" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "cc22e780", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.419405\n", - "pca(myvars, , n_components=2)[0] -0.001131\n", - "pca(myvars, , n_components=2)[1] -0.024217\n", - "ShelveLoc[Good] 4.816253\n", - "ShelveLoc[Medium] 1.924139\n", - "dtype: float64" - ] - }, - 
"execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars'), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "de571e61", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.419405 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n", - " 0.001131 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n", - " -0.024217 \n", - " ShelveLocGood \n", - " 4.816253 \n", - " ShelveLocMedium \n", - " 1.924139 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "0a103b5a", - "metadata": {}, - "source": [ - "It is of course common to scale before running PCA." 
- ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "95ca42f5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.352159\n", - "pca(myvars, , n_components=2)[0] 0.446383\n", - "pca(myvars, , n_components=2)[1] -1.219788\n", - "ShelveLoc[Good] 4.922780\n", - "ShelveLoc[Medium] 2.005617\n", - "dtype: float64" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars',\n", - " scale=True), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "0dc22e35", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.3522 \n", - "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n", - " 0.4469 \n", - 
"prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n", - " -1.2213 \n", - " ShelveLocGood \n", - " 4.9228 \n", - " ShelveLocMedium \n", - " 2.0056 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "70347ee9", - "metadata": {}, - "source": [ - "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n", - "of `np.std(ddof=1)`." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "aa0c2f2e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0.44694166, -1.22131519])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))" - ] - }, - { - "cell_type": "markdown", - "id": "ab05c497", - "metadata": {}, - "source": [ - "## Model selection\n", - "\n", - "Another task requiring different design matrices is model selection. Manipulating\n", - "the `terms` attribute of a `ModelSpec` (or more precisely its more uniform version `terms_`)\n", - "can clearly allow for both exhaustive and stepwise model selection." 
- ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "9505c178", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.strategy import (Stepwise, \n", - " min_max)\n", - "from ISLP.models.generic_selector import FeatureSelector" - ] - }, - { - "cell_type": "markdown", - "id": "020c2532", - "metadata": {}, - "source": [ - "### Best subsets" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "f9aba6db", - "metadata": {}, - "outputs": [], - "source": [ - "design = ModelSpec(['Price', \n", - " 'UIncome', \n", - " 'Advertising', \n", - " 'US', \n", - " 'Income',\n", - " 'ShelveLoc',\n", - " 'Education',\n", - " 'Urban']).fit(Carseats)\n", - "strategy = min_max(design,\n", - " min_terms=0,\n", - " max_terms=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "91144a3d", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error')" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "ae3cb2eb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.fit(Carseats, Y)" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "e63b2744", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Price', 'Advertising', 'ShelveLoc')" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "0a774b48", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('Price',), ('UIncome',), ('Advertising',), ('US',), ('Income',), ('ShelveLoc',), ('Education',), ('Urban',), ('Price', 'UIncome'), 
('Price', 'Advertising'), ('Price', 'US'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Price', 'Education'), ('Price', 'Urban'), ('UIncome', 'Advertising'), ('UIncome', 'US'), ('UIncome', 'Income'), ('UIncome', 'ShelveLoc'), ('UIncome', 'Education'), ('UIncome', 'Urban'), ('Advertising', 'US'), ('Advertising', 'Income'), ('Advertising', 'ShelveLoc'), ('Advertising', 'Education'), ('Advertising', 'Urban'), ('US', 'Income'), ('US', 'ShelveLoc'), ('US', 'Education'), ('US', 'Urban'), ('Income', 'ShelveLoc'), ('Income', 'Education'), ('Income', 'Urban'), ('ShelveLoc', 'Education'), ('ShelveLoc', 'Urban'), ('Education', 'Urban'), ('Price', 'UIncome', 'Advertising'), ('Price', 'UIncome', 'US'), ('Price', 'UIncome', 'Income'), ('Price', 'UIncome', 'ShelveLoc'), ('Price', 'UIncome', 'Education'), ('Price', 'UIncome', 'Urban'), ('Price', 'Advertising', 'US'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Advertising', 'Education'), ('Price', 'Advertising', 'Urban'), ('Price', 'US', 'Income'), ('Price', 'US', 'ShelveLoc'), ('Price', 'US', 'Education'), ('Price', 'US', 'Urban'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Income', 'Education'), ('Price', 'Income', 'Urban'), ('Price', 'ShelveLoc', 'Education'), ('Price', 'ShelveLoc', 'Urban'), ('Price', 'Education', 'Urban'), ('UIncome', 'Advertising', 'US'), ('UIncome', 'Advertising', 'Income'), ('UIncome', 'Advertising', 'ShelveLoc'), ('UIncome', 'Advertising', 'Education'), ('UIncome', 'Advertising', 'Urban'), ('UIncome', 'US', 'Income'), ('UIncome', 'US', 'ShelveLoc'), ('UIncome', 'US', 'Education'), ('UIncome', 'US', 'Urban'), ('UIncome', 'Income', 'ShelveLoc'), ('UIncome', 'Income', 'Education'), ('UIncome', 'Income', 'Urban'), ('UIncome', 'ShelveLoc', 'Education'), ('UIncome', 'ShelveLoc', 'Urban'), ('UIncome', 'Education', 'Urban'), ('Advertising', 'US', 'Income'), ('Advertising', 'US', 'ShelveLoc'), ('Advertising', 'US', 'Education'), ('Advertising', 'US', 'Urban'), 
('Advertising', 'Income', 'ShelveLoc'), ('Advertising', 'Income', 'Education'), ('Advertising', 'Income', 'Urban'), ('Advertising', 'ShelveLoc', 'Education'), ('Advertising', 'ShelveLoc', 'Urban'), ('Advertising', 'Education', 'Urban'), ('US', 'Income', 'ShelveLoc'), ('US', 'Income', 'Education'), ('US', 'Income', 'Urban'), ('US', 'ShelveLoc', 'Education'), ('US', 'ShelveLoc', 'Urban'), ('US', 'Education', 'Urban'), ('Income', 'ShelveLoc', 'Education'), ('Income', 'ShelveLoc', 'Urban'), ('Income', 'Education', 'Urban'), ('ShelveLoc', 'Education', 'Urban')])" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "0ca1f28c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Price', 'Advertising', 'Income')" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "strategy = min_max(design,\n", - " min_terms=0,\n", - " max_terms=3,\n", - " lower_terms=['Price'],\n", - " upper_terms=['Price', 'Income', 'Advertising'])\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error')\n", - "selector.fit(Carseats, Y)\n", - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "5c6732fa", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([('Price',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'Advertising', 'Income')])" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_.keys()" - ] - }, - { - "cell_type": "markdown", - "id": "7bb6fcc3", - "metadata": {}, - "source": [ - "### Stepwise selection" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "9985d0fc", - "metadata": {}, - "outputs": [ - { - 
"data": { - "text/plain": [ - "('Advertising', 'Income', 'Price', 'ShelveLoc')" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "strategy = Stepwise.first_peak(design,\n", - " min_terms=0,\n", - " max_terms=6,\n", - " lower_terms=['Price'],\n", - " upper_terms=['Price', 'Income', 'Advertising', 'ShelveLoc', 'UIncome', 'US'\n", - " 'Education', 'Urban'])\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error',\n", - " cv=3)\n", - "selector.fit(Carseats, Y)\n", - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "d3cf3e9b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('Price',), ('Price', 'UIncome'), ('Advertising', 'Price'), ('Income', 'Price'), ('Price', 'ShelveLoc'), ('Price', 'Urban'), ('Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Price', 'ShelveLoc'), ('Income', 'Price', 'ShelveLoc'), ('Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc'), ('Advertising', 'Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban')])" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "dd43ea7c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{(): -8.055847677297269,\n", - " ('Price',): -6.514630258019962,\n", - " ('Price', 'UIncome'): -6.621654905418576,\n", - " ('Advertising', 'Price'): -5.825225309857156,\n", - " ('Income', 'Price'): -6.455432795910743,\n", - " ('Price', 'ShelveLoc'): -3.780183168075897,\n", - " ('Price', 'Urban'): -6.5430157266926114,\n", - " ('Price', 'ShelveLoc', 'UIncome'): 
-3.6938729706475004,\n", - " ('Advertising', 'Price', 'ShelveLoc'): -3.2067316025050645,\n", - " ('Income', 'Price', 'ShelveLoc'): -3.634698914456587,\n", - " ('Price', 'ShelveLoc', 'Urban'): -3.776148947585277,\n", - " ('Advertising', 'Price', 'ShelveLoc', 'UIncome'): -3.1240961493998642,\n", - " ('Advertising', 'Income', 'Price', 'ShelveLoc'): -3.0801704971796244,\n", - " ('Advertising', 'Price', 'ShelveLoc', 'Urban'): -3.207569489139369,\n", - " ('Advertising',\n", - " 'Income',\n", - " 'Price',\n", - " 'ShelveLoc',\n", - " 'UIncome'): -3.1048826894036115,\n", - " ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban'): -3.0867130108677423}" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.results_" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "7c026f0a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Advertising', 'Income', 'Price', 'ShelveLoc')" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "selector.selected_state_" - ] - }, - { - "cell_type": "markdown", - "id": "b4b89d04", - "metadata": {}, - "source": [ - "### Enforcing constraints\n", - "\n", - "In models with interactions, we may often want to impose constraints on interactions and main effects.\n", - "This can be achieved here by use of a `validator` that checks whether a given model is valid.\n", - "\n", - "Suppose we want to have the following constraint: `ShelveLoc` may not be in the model unless\n", - "`Price` is in the following model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "1c1e31d0", - "metadata": {}, - "outputs": [], - "source": [ - "design = ModelSpec(['Price', \n", - " 'Advertising', \n", - " 'Income',\n", - " 'ShelveLoc']).fit(Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "be929807", - "metadata": {}, - "source": [ - "The constraints are described with a boolean matrix with `(i,j)` as `j` is a child of `i`: so `j` should not\n", - "be in the model when `i` is not and enforced with a callable `validator` that evaluates each candidate state.\n", - "\n", - "Both `min_max_strategy` and `step_strategy` accept a `validator` argument." - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "c075b1b7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('Price',), ('Advertising',), ('Income',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Advertising', 'Income'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Advertising', 'Income', 'ShelveLoc')])" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.strategy import validator_from_constraints\n", - "constraints = np.zeros((4, 4))\n", - "constraints[0,3] = 1\n", - "strategy = min_max(design,\n", - " min_terms=0,\n", - " max_terms=4,\n", - " validator=validator_from_constraints(design,\n", - " constraints))\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error',\n", - " cv=3)\n", - "selector.fit(Carseats, Y)\n", - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "3472d47c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Price', 'Advertising', 'Income', 'ShelveLoc')" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "selector.selected_state_" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "5d2c82b9", - "metadata": {}, - "outputs": [], - "source": [ - "Hitters=load_data('Hitters')" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "id": "4b2ac2c2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n", - " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n", - " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n", - " dtype='object')" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Hitters.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "bd2ad0dd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys([(), ('AtBat',), ('Hits',), ('HmRun',), ('Runs',), ('RBI',), ('Walks',), ('Years',), ('CAtBat',), ('CHits',), ('CHmRun',), ('CRuns',), ('CRBI',), ('CWalks',), ('League',), ('Division',), ('PutOuts',), ('Assists',), ('Errors',), ('NewLeague',), ('AtBat', 'CRBI'), ('CRBI', 'Hits'), ('CRBI', 'HmRun'), ('CRBI', 'Runs'), ('CRBI', 'RBI'), ('CRBI', 'Walks'), ('CRBI', 'Years'), ('CAtBat', 'CRBI'), ('CHits', 'CRBI'), ('CHmRun', 'CRBI'), ('CRBI', 'CRuns'), ('CRBI', 'CWalks'), ('CRBI', 'League'), ('CRBI', 'Division'), ('CRBI', 'PutOuts'), ('Assists', 'CRBI'), ('CRBI', 'Errors'), ('CRBI', 'NewLeague'), ('AtBat', 'CRBI', 'Hits'), ('CRBI', 'Hits', 'HmRun'), ('CRBI', 'Hits', 'Runs'), ('CRBI', 'Hits', 'RBI'), ('CRBI', 'Hits', 'Walks'), ('CRBI', 'Hits', 'Years'), ('CAtBat', 'CRBI', 'Hits'), ('CHits', 'CRBI', 'Hits'), ('CHmRun', 'CRBI', 'Hits'), ('CRBI', 'CRuns', 'Hits'), ('CRBI', 'CWalks', 'Hits'), ('CRBI', 'Hits', 'League'), ('CRBI', 'Division', 'Hits'), ('CRBI', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits'), ('CRBI', 'Errors', 'Hits'), ('CRBI', 'Hits', 'NewLeague'), ('AtBat', 
'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Hits', 'PutOuts', 'Years'), ('CAtBat', 'CRBI', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'League', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), ('CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'League', 'PutOuts'), ('Assists', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 
'League', 'PutOuts'), ('Assists', 'AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 
'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 
'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 
'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 
'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years')])" - ] - 
}, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Hitters = Hitters.dropna()\n", - "Y=Hitters['Salary']\n", - "X=Hitters.drop('Salary', axis=1)\n", - "design = ModelSpec(X.columns).fit(X)\n", - "strategy = Stepwise.first_peak(design,\n", - " direction='forward',\n", - " min_terms=0,\n", - " max_terms=19)\n", - "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n", - " strategy,\n", - " scoring='neg_mean_squared_error', cv=None)\n", - "selector.fit(X, Y)\n", - "selector.results_.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "31788748", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "19" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(selector.selected_state_)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "id": "e97d80c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "19" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(X.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a71f0332", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Start: AIC=3215.77\n", - "Salary ~ 1\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ CRBI 1 17139434 36179679 3115.8\n", - "+ CRuns 1 16881162 36437951 3117.6\n", - "+ CHits 1 16065140 37253973 3123.5\n", - "+ CAtBat 1 14759710 38559403 3132.5\n", - "+ CHmRun 1 14692193 38626920 3133.0\n", - "+ CWalks 1 12792622 40526491 3145.6\n", - "+ RBI 1 10771083 42548030 3158.4\n", - "+ Walks 1 10504833 42814280 3160.1\n", - "+ Hits 1 10260491 43058621 3161.6\n", - "+ Runs 1 9399158 43919955 3166.8\n", - "+ Years 1 8559105 44760007 3171.7\n", - "+ AtBat 1 8309469 45009644 3173.2\n", - "+ HmRun 1 6273967 47045145 3184.8\n", - "+ PutOuts 1 4814100 48505013 3192.9\n", - "+ 
Division 1 1976102 51343011 3207.8\n", - " 53319113 3215.8\n", - "+ Assists 1 34497 53284615 3217.6\n", - "+ League 1 10876 53308237 3217.7\n", - "+ Errors 1 1555 53317558 3217.8\n", - "+ NewLeague 1 428 53318684 3217.8\n", - "\n", - "Step: AIC=3115.78\n", - "Salary ~ CRBI\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ Hits 1 5533119 30646560 3074.1\n", - "+ Runs 1 5176532 31003147 3077.2\n", - "+ Walks 1 4199733 31979946 3085.3\n", - "+ AtBat 1 4064585 32115095 3086.4\n", - "+ RBI 1 3308272 32871407 3092.6\n", - "+ PutOuts 1 3267035 32912644 3092.9\n", - "+ Division 1 1733887 34445793 3104.9\n", - "+ Years 1 1667339 34512340 3105.4\n", - "+ HmRun 1 1271587 34908092 3108.4\n", - "+ CRuns 1 354561 35825119 3115.2\n", - "+ Assists 1 346020 35833659 3115.2\n", - " 36179679 3115.8\n", - "+ Errors 1 194403 35985276 3116.4\n", - "+ CAtBat 1 92261 36087418 3117.1\n", - "+ CHits 1 75469 36104210 3117.2\n", - "+ CWalks 1 51974 36127705 3117.4\n", - "+ NewLeague 1 17778 36161901 3117.7\n", - "+ League 1 11825 36167855 3117.7\n", - "+ CHmRun 1 515 36179165 3117.8\n", - "\n", - "Step: AIC=3074.13\n", - "Salary ~ CRBI + Hits\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ PutOuts 1 1397263 29249297 3063.8\n", - "+ Division 1 1279275 29367285 3064.9\n", - "+ AtBat 1 821767 29824793 3069.0\n", - "+ Walks 1 781767 29864793 3069.3\n", - "+ Years 1 254910 30391650 3073.9\n", - " 30646560 3074.1\n", - "+ League 1 208880 30437680 3074.3\n", - "+ CRuns 1 132614 30513946 3075.0\n", - "+ NewLeague 1 118474 30528086 3075.1\n", - "+ Runs 1 114198 30532362 3075.1\n", - "+ Errors 1 99776 30546784 3075.3\n", - "+ CAtBat 1 83517 30563043 3075.4\n", - "+ Assists 1 44781 30601779 3075.7\n", - "+ CWalks 1 23668 30622892 3075.9\n", - "+ CHmRun 1 4790 30641769 3076.1\n", - "+ CHits 1 4358 30642202 3076.1\n", - "+ HmRun 1 2173 30644387 3076.1\n", - "+ RBI 1 1137 30645423 3076.1\n", - "\n", - "Step: AIC=3063.85\n", - "Salary ~ CRBI + Hits + PutOuts\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ Division 
1 1278445 27970852 3054.1\n", - "+ AtBat 1 1009933 28239364 3056.6\n", - "+ Walks 1 539490 28709807 3061.0\n", - "+ CRuns 1 273649 28975648 3063.4\n", - " 29249297 3063.8\n", - "+ Years 1 136906 29112391 3064.6\n", - "+ League 1 122841 29126456 3064.8\n", - "+ Runs 1 117930 29131367 3064.8\n", - "+ Errors 1 97244 29152053 3065.0\n", - "+ NewLeague 1 57839 29191458 3065.3\n", - "+ CHits 1 35096 29214201 3065.5\n", - "+ RBI 1 33965 29215331 3065.6\n", - "+ HmRun 1 31227 29218070 3065.6\n", - "+ CWalks 1 28572 29220725 3065.6\n", - "+ CAtBat 1 20518 29228779 3065.7\n", - "+ Assists 1 1681 29247616 3065.8\n", - "+ CHmRun 1 1419 29247878 3065.8\n", - "\n", - "Step: AIC=3054.1\n", - "Salary ~ CRBI + Hits + PutOuts + Division\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ AtBat 1 820952 27149899 3048.3\n", - "+ Walks 1 491584 27479268 3051.4\n", - " 27970852 3054.1\n", - "+ CRuns 1 193604 27777248 3054.3\n", - "+ Years 1 166845 27804007 3054.5\n", - "+ League 1 110628 27860224 3055.1\n", - "+ Errors 1 81385 27889467 3055.3\n", - "+ Runs 1 65921 27904931 3055.5\n", - "+ RBI 1 53719 27917133 3055.6\n", - "+ NewLeague 1 52275 27918577 3055.6\n", - "+ CHits 1 33863 27936989 3055.8\n", - "+ HmRun 1 26390 27944462 3055.8\n", - "+ CAtBat 1 18751 27952101 3055.9\n", - "+ CWalks 1 5723 27965129 3056.0\n", - "+ Assists 1 1036 27969816 3056.1\n", - "+ CHmRun 1 165 27970687 3056.1\n", - "\n", - "Step: AIC=3048.26\n", - "Salary ~ CRBI + Hits + PutOuts + Division + AtBat\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ Walks 1 954996 26194904 3040.8\n", - "+ Years 1 253362 26896537 3047.8\n", - "+ Runs 1 208743 26941157 3048.2\n", - " 27149899 3048.3\n", - "+ CRuns 1 185825 26964075 3048.5\n", - "+ League 1 95986 27053913 3049.3\n", - "+ NewLeague 1 52693 27097206 3049.8\n", - "+ CHmRun 1 43173 27106726 3049.8\n", - "+ Assists 1 28898 27121001 3050.0\n", - "+ CAtBat 1 20989 27128910 3050.1\n", - "+ CWalks 1 15599 27134301 3050.1\n", - "+ Errors 1 6265 27143634 3050.2\n", - "+ CHits 1 5305 
27144594 3050.2\n", - "+ RBI 1 1236 27148663 3050.2\n", - "+ HmRun 1 11 27149888 3050.3\n", - "\n", - "Step: AIC=3040.85\n", - "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ CWalks 1 240687 25954217 3040.4\n", - " 26194904 3040.8\n", - "+ Years 1 184508 26010396 3041.0\n", - "+ CRuns 1 110695 26084209 3041.7\n", - "+ League 1 77974 26116930 3042.1\n", - "+ Assists 1 75782 26119122 3042.1\n", - "+ NewLeague 1 40909 26153995 3042.4\n", - "+ CHits 1 37304 26157599 3042.5\n", - "+ RBI 1 11728 26183176 3042.7\n", - "+ HmRun 1 4747 26190157 3042.8\n", - "+ Errors 1 2727 26192177 3042.8\n", - "+ CAtBat 1 2630 26192274 3042.8\n", - "+ CHmRun 1 943 26193961 3042.8\n", - "+ Runs 1 37 26194867 3042.8\n", - "\n", - "Step: AIC=3040.42\n", - "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks + CWalks\n", - "\n", - " Df Sum of Sq RSS AIC\n", - "+ CRuns 1 794983 25159234 3034.2\n", - "+ CHits 1 273728 25680489 3039.6\n", - " 25954217 3040.4\n", - "+ Assists 1 138506 25815711 3041.0\n", - "+ CAtBat 1 89289 25864929 3041.5\n", - "+ RBI 1 86941 25867276 3041.5\n", - "+ League 1 77159 25877058 3041.6\n", - "+ Years 1 70126 25884091 3041.7\n", - "+ NewLeague 1 37807 25916410 3042.0\n", - "+ HmRun 1 33601 25920616 3042.1\n", - "+ CHmRun 1 9034 25945183 3042.3\n", - "+ Errors 1 6928" - ] - } - ], - "source": [ - "%%R -i Hitters\n", - "step(lm(Salary ~ 1, data=Hitters), scope=list(upper=lm(Salary ~ ., data=Hitters)), direction='forward', trace=TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6117f650", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "536a8bc3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bddc13c5", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/source/models/spec.ipynb 
b/docs/source/models/spec.ipynb index d6ba7b0..fce6b32 100644 --- a/docs/source/models/spec.ipynb +++ b/docs/source/models/spec.ipynb @@ -31,7 +31,7 @@ "from ISLP.models import (ModelSpec,\n", " summarize,\n", " Column,\n", - " Variable,\n", + " Feature,\n", " build_columns)\n", "\n", "import statsmodels.api as sm" @@ -257,7 +257,7 @@ "metadata": {}, "source": [ "We note that a column has been added for the intercept by default. This can be changed using the\n", - "`intercept` argument. " + "`intercept` argument." ] }, { @@ -391,7 +391,7 @@ "in the column space of the design matrix.\n", "\n", "To include this intercept via `ShelveLoc` we can use 3 columns to encode this categorical variable. Following the nomenclature of\n", - "`R`, we call this a `Contrast` of the categorical variable. " + "`R`, we call this a `Contrast` of the categorical variable." ] }, { @@ -597,14 +597,6 @@ "shelve.get_columns(Carseats)" ] }, - { - "cell_type": "markdown", - "id": "5d8b048f-3c31-47ac-8946-0662f5e57b63", - "metadata": {}, - "source": [ - "shelve.get_columns?" - ] - }, { "cell_type": "markdown", "id": "269e6d18-4ae4-4a77-8498-90281ae7c803", @@ -946,7 +938,7 @@ "\n", "The first argument to `ModelSpec` is stored as the `terms` attribute. Under the hood,\n", "this sequence is inspected to produce the `terms_` attribute which specify the objects\n", - "that will ultimately create the design matrix. " + "that will ultimately create the design matrix." 
] }, { @@ -958,8 +950,8 @@ { "data": { "text/plain": [ - "[Variable(variables=('ShelveLoc',), name='ShelveLoc', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" + "[Feature(variables=('ShelveLoc',), name='ShelveLoc', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", + " Feature(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" ] }, "execution_count": 13, @@ -978,7 +970,7 @@ "id": "warming-mobile", "metadata": {}, "source": [ - "Each element of `terms_` should be a `Variable` which describes a set of columns to be extracted from\n", + "Each element of `terms_` should be a `Feature` which describes a set of columns to be extracted from\n", "a columnar data form as well as possible a possible encoder." ] }, @@ -1134,17 +1126,17 @@ "id": "former-spring", "metadata": {}, "source": [ - "### `Variable` objects\n", + "### `Feature` objects\n", "\n", - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n", + "Note that `Feature` objects have a tuple of `variables` as well as an `encoder` attribute. The\n", "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", + "is run through `encoder.transform`. The `encoder.fit` method of each `Feature` is run once during \n", "the call to `ModelSpec.fit`." 
] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "floral-liabilities", "metadata": {}, "outputs": [ @@ -1263,15 +1255,13 @@ "[400 rows x 3 columns]" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None)\n", + "new_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None)\n", "build_columns(MS.column_info_,\n", " Carseats, \n", " new_var)[0]" @@ -1288,18 +1278,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "imported-measure", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/html": [ @@ -1403,7 +1385,7 @@ "[400 rows x 2 columns]" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1412,7 +1394,7 @@ "from sklearn.decomposition import PCA\n", "pca = PCA(n_components=2)\n", "pca.fit(build_columns(MS.column_info_, Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca)\n", + "pca_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca)\n", "build_columns(MS.column_info_,\n", " Carseats, \n", " pca_var)[0]" @@ -1424,23 +1406,15 @@ "metadata": {}, "source": [ "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." + "or `Feature` instances (`pca_var`)." 
] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "western-bloom", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/html": [ @@ -1568,14 +1542,14 @@ "[400 rows x 4 columns]" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "price = MS.column_info_['Price']\n", - "fancy_var = Variable(('Income', price, pca_var), name='fancy', encoder=None)\n", + "fancy_var = Feature(('Income', price, pca_var), name='fancy', encoder=None)\n", "build_columns(MS.column_info_,\n", " Carseats, \n", " fancy_var)[0]" @@ -1583,121 +1557,95 @@ }, { "cell_type": "markdown", - "id": "absent-branch", + "id": "e289feba-e3f5-48e0-9e29-cdd88d7f9923", "metadata": {}, "source": [ - "## Predicting at new points\n", - "\n", - "As `ModelSpec` is a transformer, it can be evaluated at new feature values.\n", - "Constructing the design matrix at any values is carried out by the `transform` method." 
+ "## Predicting at new points" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "naked-hollywood", + "execution_count": 22, + "id": "6efed2fa-9e5d-429c-a8d9-ac544cab2b41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([ 9.73389663, 26.06456997])" + "intercept 12.661546\n", + "Price -0.052213\n", + "Income 0.012829\n", + "dtype: float64" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "new_data = pd.DataFrame({'Income':['Bad', 'Good'], 'Price':[40, 50]})\n", - "new_X = MS.transform(new_data)\n", - "M_ols.get_prediction(new_X).predicted_mean" + "MS = ModelSpec(['Price', 'Income']).fit(Carseats)\n", + "X = MS.transform(Carseats)\n", + "Y = Carseats['Sales']\n", + "M_ols = sm.OLS(Y, X).fit()\n", + "M_ols.params" ] }, { "cell_type": "markdown", - "id": "signal-yahoo", + "id": "e6b4609b-fcb2-4cc2-b630-509df4c87546", "metadata": {}, "source": [ - "## Using `np.ndarray`\n", - "\n", - "As the basic model is to concatenate columns extracted from a columnar data\n", - "representation, one *can* use `np.ndarray` as the column data. In this case,\n", - "columns will be selected by integer indices. \n", - "\n", - "### Caveats using `np.ndarray`\n", - "\n", - "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "However,\n", - "unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues.\n", - "\n", - "However, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. \n", - "\n", - "We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new\n", - "values of `Price` and `Location`. 
We first find the predicitions using `pd.DataFrame` and then illustrate the difficulties\n", - "in using `np.ndarray`." + "As `ModelSpec` is a transformer, it can be evaluated at new feature values.\n", + "Constructing the design matrix at any values is carried out by the `transform` method." ] }, { "cell_type": "code", - "execution_count": 42, - "id": "964ecc79-7303-410c-b258-2d58341c7dc0", + "execution_count": 23, + "id": "8784b0e8-ce53-4a90-aee6-b935834295c7", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " intercept Price Income\n", - "0 1.0 40 10\n", - "1 1.0 50 20\n" - ] - }, { "data": { "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" + "array([10.70130676, 10.307465 ])" ] }, - "execution_count": 42, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "MS = ModelSpec(['Price', 'Income']).fit(Carseats)\n", - "M_ols = sm.OLS(Y, MS.transform(Carseats)).fit()\n", - "\n", "new_data = pd.DataFrame({'Price':[40, 50], 'Income':[10, 20]})\n", "new_X = MS.transform(new_data)\n", - "print(new_X)\n", - "M_ols.params" + "M_ols.get_prediction(new_X).predicted_mean" ] }, { - "cell_type": "code", - "execution_count": 25, - "id": "a42c239e-a5eb-4c5d-919e-16c4d58d1c8d", + "cell_type": "markdown", + "id": "signal-yahoo", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([10.70130676, 10.307465 ])" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "M_ols.get_prediction(new_X).predicted_mean" + "## Using `np.ndarray`\n", + "\n", + "As the basic model is to concatenate columns extracted from a columnar data\n", + "representation, one *can* use `np.ndarray` as the column data. In this case,\n", + "columns will be selected by integer indices. 
\n", + "\n", + "### Caveats using `np.ndarray`\n", + "\n", + "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", + "However,\n", + "unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues.\n", + "\n", + "However, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", + "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. \n", + "\n", + "We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new\n", + "values of `Price` and `Income`. We first find the predictions using `pd.DataFrame` and then illustrate the difficulties\n", + "in using `np.ndarray`." ] }, {
+1788,7 @@ "new_D = np.array([[40,50], [np.nan, np.nan], [10,20]]).T\n", "new_X = MS_np.transform(new_D)\n", "print(new_X)\n", - "M_ols.get_prediction(new_X).predicted_mean\n" + "M_ols.get_prediction(new_X).predicted_mean" ] }, { @@ -1855,10 +1803,10 @@ ], "metadata": { "jupytext": { - "formats": "ipynb" + "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" }, "kernelspec": { - "display_name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1872,7 +1820,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/docs/source/models/submodels.ipynb b/docs/source/models/submodels.ipynb deleted file mode 100644 index 825bedd..0000000 --- a/docs/source/models/submodels.ipynb +++ /dev/null @@ -1,3127 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ee33d364", - "metadata": {}, - "source": [ - "# Building design matrices with `ModelSpec`\n", - "\n", - "Force rebuild" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "4c70fbaa", - "metadata": {}, - "outputs": [], - "source": [ - "x=4\n", - "import numpy as np, pandas as pd\n", - "%load_ext rpy2.ipython\n", - "\n", - "from ISLP import load_data\n", - "from ISLP.models import ModelSpec\n", - "\n", - "import statsmodels.api as sm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8a708215", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n", - " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n", - " dtype='object')" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats = load_data('Carseats')\n", - "%R -i Carseats\n", - "Carseats.columns" - ] - }, - { - "cell_type": "markdown", - "id": "dad5e991", - "metadata": {}, - 
"source": [ - "## Let's break up income into groups" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ac7086a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: OIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L' < 'M' < 'H']" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'])\n", - "Carseats['OIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "261446c8", - "metadata": {}, - "source": [ - "Let's also create an unordered version" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "674bb806", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 M\n", - "1 L\n", - "2 L\n", - "3 H\n", - "4 M\n", - " ..\n", - "395 H\n", - "396 L\n", - "397 L\n", - "398 M\n", - "399 L\n", - "Name: UIncome, Length: 400, dtype: category\n", - "Categories (3, object): ['L', 'M', 'H']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n", - " [0,50,90,200], \n", - " labels=['L','M','H'],\n", - " ordered=False)\n", - "Carseats['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "8f030039", - "metadata": {}, - "source": [ - "## A simple model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "40cd6c28", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Price', 'Income'], dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Price', 'Income'])\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - 
"cell_type": "code", - "execution_count": 6, - "id": "e65f5607", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 12.661546\n", - "Price -0.052213\n", - "Income 0.012829\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Y = Carseats['Sales']\n", - "M = sm.OLS(Y, X).fit()\n", - "M.params" - ] - }, - { - "cell_type": "markdown", - "id": "29d9b55f", - "metadata": {}, - "source": [ - "## Basic procedure\n", - "\n", - "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n", - "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n", - "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "cfbe5b92", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Bad\n", - "1 Good\n", - "2 Medium\n", - "3 Medium\n", - "4 Bad\n", - " ... \n", - "395 Good\n", - "396 Medium\n", - "397 Medium\n", - "398 Bad\n", - "399 Good\n", - "Name: ShelveLoc, Length: 400, dtype: category\n", - "Categories (3, object): ['Bad', 'Good', 'Medium']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "7092f666", - "metadata": {}, - "source": [ - "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n", - "`get_columns` and `fit_encoder`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e2d43844", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['ShelveLoc']" - ] - }, - { - "cell_type": "markdown", - "id": "46a01612", - "metadata": {}, - "source": [ - "It recognized ordinal columns as well." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "465a9326", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['OIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "76f8480d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([ 73, 48, 35, 100]), ('Income',))" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "income = design.column_info_['Income']\n", - "cols, names = income.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "markdown", - "id": "25fcc1de", - "metadata": {}, - "source": [ - "## Encoding a column\n", - "\n", - "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n", - "variables usually are encoded by several columns, typically one less than the number of categories.\n", - "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n", - "model, i.e. `fit` on some array and `transform` on future arrays. 
The `fit_encoder` method of `Column` fits\n", - "its encoder the first time data is passed to it." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "dfe6cc35", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([[0., 0.],\n", - " [1., 0.],\n", - " [0., 1.],\n", - " [0., 1.]]),\n", - " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shelve = design.column_info_['ShelveLoc']\n", - "cols, names = shelve.get_columns(Carseats)\n", - "(cols[:4], names)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8fc9779a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[2.],\n", - " [1.],\n", - " [1.],\n", - " [0.]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "oincome = design.column_info_['OIncome']\n", - "oincome.get_columns(Carseats)[0][:4]" - ] - }, - { - "cell_type": "markdown", - "id": "8e04da60", - "metadata": {}, - "source": [ - "## The terms\n", - "\n", - "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n", - "the `terms` argument which should be a sequence. The elements of `terms` are often\n", - "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n", - "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c579dbce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Price', 'Income']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "4587b8bd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "2595f0fa", - "metadata": {}, - "source": [ - "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n", - "`Variable` can also create columns through the `build_columns` method of `ModelSpec`" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "03bd9366", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price\n", - " 0 120\n", - " 1 83\n", - " 2 80\n", - " 3 97\n", - " 4 128\n", - " .. ...\n", - " 395 128\n", - " 396 120\n", - " 397 159\n", - " 398 95\n", - " 399 120\n", - " \n", - " [400 rows x 1 columns],\n", - " ['Price'])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "price = design.terms_[0]\n", - "design.build_columns(Carseats, price)" - ] - }, - { - "cell_type": "markdown", - "id": "de04ca48", - "metadata": {}, - "source": [ - "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. 
The\n", - "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n", - "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n", - "the call to `ModelSpec.fit`." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a42af4c5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( Price Income UIncome[L] UIncome[M]\n", - " 0 120.0 73.0 0.0 1.0\n", - " 1 83.0 48.0 1.0 0.0\n", - " 2 80.0 35.0 1.0 0.0\n", - " 3 97.0 100.0 0.0 0.0\n", - " 4 128.0 64.0 0.0 1.0\n", - " .. ... ... ... ...\n", - " 395 128.0 108.0 0.0 0.0\n", - " 396 120.0 23.0 1.0 0.0\n", - " 397 159.0 26.0 1.0 0.0\n", - " 398 95.0 79.0 0.0 1.0\n", - " 399 120.0 37.0 1.0 0.0\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import Variable\n", - "\n", - "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n", - "design.build_columns(Carseats, new_var)" - ] - }, - { - "cell_type": "markdown", - "id": "b146d0c0", - "metadata": {}, - "source": [ - "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n", - "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b6c394a6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( mynewvar[0] mynewvar[1]\n", - " 0 -3.608693 -4.853177\n", - " 1 15.081506 35.708630\n", - " 2 27.422871 40.774250\n", - " 3 -33.973209 13.470489\n", - " 4 6.567316 -11.290100\n", - " .. ... ...\n", - " 395 -36.846346 -18.415783\n", - " 396 45.741500 3.245602\n", - " 397 49.097533 -35.725355\n", - " 398 -13.577772 18.845139\n", - " 399 31.927566 0.978436\n", - " \n", - " [400 rows x 2 columns],\n", - " ['mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=2)\n", - "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n", - "design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "markdown", - "id": "3bb30a3f", - "metadata": {}, - "source": [ - "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n", - "or `Variable` instances (`pca_var`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "ea7770ff", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( Price Price mynewvar[0] mynewvar[1]\n", - " 0 120.0 120.0 -3.608693 -4.853177\n", - " 1 83.0 83.0 15.081506 35.708630\n", - " 2 80.0 80.0 27.422871 40.774250\n", - " 3 97.0 97.0 -33.973209 13.470489\n", - " 4 128.0 128.0 6.567316 -11.290100\n", - " .. ... ... ... ...\n", - " 395 128.0 128.0 -36.846346 -18.415783\n", - " 396 120.0 120.0 45.741500 3.245602\n", - " 397 159.0 159.0 49.097533 -35.725355\n", - " 398 95.0 95.0 -13.577772 18.845139\n", - " 399 120.0 120.0 31.927566 0.978436\n", - " \n", - " [400 rows x 4 columns],\n", - " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n", - "design.build_columns(Carseats, fancy_var)" - ] - }, - { - "cell_type": "markdown", - "id": "b2b4a01a", - "metadata": {}, - "source": [ - "We can of course run PCA again on these features (if we wanted)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "21ad8b44", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "( fancy_pca[0] fancy_pca[1]\n", - " 0 -6.951792 4.859283\n", - " 1 55.170148 -24.694875\n", - " 2 59.418556 -38.033572\n", - " 3 34.722389 28.922184\n", - " 4 -21.419184 -3.120673\n", - " .. ... 
...\n", - " 395 -18.257348 40.760122\n", - " 396 -10.546709 -45.021658\n", - " 397 -77.706359 -37.174379\n", - " 398 36.668694 7.730851\n", - " 399 -9.540535 -31.059122\n", - " \n", - " [400 rows x 2 columns],\n", - " ['fancy_pca[0]', 'fancy_pca[1]'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pca2 = PCA(n_components=2)\n", - "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n", - "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n", - "design.build_columns(Carseats, pca2_var)" - ] - }, - { - "cell_type": "markdown", - "id": "2262377d", - "metadata": {}, - "source": [ - "## Building the design matrix\n", - "\n", - "With these notions in mind, the final design is essentially then" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "1654ca47", - "metadata": {}, - "outputs": [], - "source": [ - "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "1db0e0a9", - "metadata": {}, - "source": [ - "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n", - "a dataframe the index is adjusted accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "d20e8ea8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.intercept" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "450fe910", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptPriceIncome
01.012073
11.08348
21.08035
31.097100
\n", - "
" - ], - "text/plain": [ - " intercept Price Income\n", - "0 1.0 120 73\n", - "1 1.0 83 48\n", - "2 1.0 80 35\n", - "3 1.0 97 100" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.transform(Carseats)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "0705ba6f", - "metadata": {}, - "source": [ - "## Predicting\n", - "\n", - "Constructing the design matrix at any values is carried out by the `transform` method." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "866c2863", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n", - "new_X = design.transform(new_data)\n", - "M.get_prediction(new_X).predicted_mean" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "f2021166", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0 1 \n", - "12.65258 12.25873 \n" - ] - } - ], - "source": [ - "%%R -i new_data,Carseats\n", - "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "20e1a31a", - "metadata": {}, - "source": [ - "### Difference between using `pd.DataFrame` and `np.ndarray`\n", - "\n", - "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n", - "\n", - "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n", - "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "a5926ec9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1.0, 120, 73],\n", - " [1.0, 83, 48],\n", - " [1.0, 80, 35],\n", - " [1.0, 97, 100]], dtype=object)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n", - "design_np = ModelSpec([0,3]).fit(Carseats_np)\n", - "design_np.transform(Carseats_np)[:4]" - ] - }, - { - "cell_type": "markdown", - "id": "997a63cb", - "metadata": {}, - "source": [ - "The following will fail for hopefully obvious reasons" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "40410c48", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n" - ] - } - ], - "source": [ - "try:\n", - " new_D = np.zeros((2,2))\n", - " new_D[:,0] = [10,20]\n", - " new_D[:,1] = [40,50]\n", - " M.get_prediction(new_D).predicted_mean\n", - "except ValueError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "920203e9", - "metadata": {}, - "source": [ - "Ultimately, `M` expects 3 columns for new predictions because it was fit\n", - "with a matrix having 3 columns (the first representing an intercept).\n", - "\n", - "We might be tempted to try as with the `pd.DataFrame` and produce\n", - "an `np.ndarray` with only the necessary variables." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "1061da77", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "index 3 is out of bounds for axis 1 with size 2\n" - ] - } - ], - "source": [ - "try:\n", - " new_X = np.zeros((2,2))\n", - " new_X[:,0] = [10,20]\n", - " new_X[:,1] = [40,50]\n", - " new_D = design_np.transform(new_X)\n", - " M.get_prediction(new_D).predicted_mean\n", - "except IndexError as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "id": "c6bfe001", - "metadata": {}, - "source": [ - "This fails because `design_np` is looking for column `3` from its `terms`:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "5ae6d25f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n", - " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design_np.terms_" - ] - }, - { - "cell_type": "markdown", - "id": "edd7ebeb", - "metadata": {}, - "source": [ - "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n", - "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "9455e532", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([12.65257604, 12.25873428])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_X = np.zeros((2,4))\n", - "new_X[:,0] = [10,20]\n", - "new_X[:,3] = [40,50]\n", - "new_D = design_np.transform(new_X)\n", - "M.get_prediction(new_D).predicted_mean" - ] 
- }, - { - "cell_type": "markdown", - "id": "fd726791", - "metadata": {}, - "source": [ - "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n", - "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case." - ] - }, - { - "cell_type": "markdown", - "id": "967d9ebc", - "metadata": {}, - "source": [ - "## A model with some categorical variables\n", - "\n", - "Categorical variables become `Column` instances with encoders." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d0429b56", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n", - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "415e3fd0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "8a99c3a5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 11.876012\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[L] -1.042297\n", - "UIncome[M] -0.119123\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - 
] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "9250a28a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "fe90c12c", - "metadata": {}, - "source": [ - "## Getting the encoding you want\n", - "\n", - "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n", - "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n", - "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n", - "where `new_encoder` would somehow drop the column we want dropped. \n", - "\n", - "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "0546ec84", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.column_info_['UIncome']" - ] - }, - { - "cell_type": "markdown", - "id": "6ec4fe65", - "metadata": {}, - "source": [ - "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n", - "we may want to encode `UIncome` differently within the same model. 
In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n", - "and we need a way to allow different encodings of the same column of `Carseats`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "61e7f56e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "802ed854", - "metadata": {}, - "source": [ - " We can create a new \n", - "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so." - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "82d7a01d", - "metadata": {}, - "outputs": [], - "source": [ - "from ISLP.models.model_spec import contrast\n", - "pref_encoding = contrast('UIncome', 'drop', 'L')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "e26849a1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[M] UIncome[H]\n", - " 0 1.0 0.0\n", - " 1 0.0 0.0\n", - " 2 0.0 0.0\n", - " 3 0.0 1.0\n", - " 4 1.0 0.0\n", - " .. ... 
...\n", - " 395 0.0 1.0\n", - " 396 0.0 0.0\n", - " 397 0.0 0.0\n", - " 398 1.0 0.0\n", - " 399 0.0 0.0\n", - " \n", - " [400 rows x 2 columns],\n", - " ['UIncome[M]', 'UIncome[H]'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, pref_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "2fc4cd8c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n", - " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " dtype='object')" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n", - "X = design.fit_transform(Carseats)\n", - "X.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "49e33d41", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 10.833715\n", - "Population 0.001163\n", - "Price -0.055725\n", - "UIncome[M] 0.923174\n", - "UIncome[H] 1.042297\n", - "ShelveLoc[Good] 4.999623\n", - "ShelveLoc[Medium] 1.964278\n", - "dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "ce018fdf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) Population Price UIncomeM UIncomeH \n", - " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.99962319 1.96427771 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "2d42b822", - "metadata": {}, - "source": [ - "## Interactions\n", - 
"\n", - "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n", - "to `ModelSpec`." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "fbb3e3ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 7.866634\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "UIncome[L] -2.734895\n", - "UIncome[M] -2.619745\n", - "dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "f9a7d4ad", - "metadata": {}, - "source": [ - "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n", - "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "5a6f8e69", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n", - " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n", - " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n", - " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.terms_[0]" - ] - }, - { - "cell_type": "markdown", - "id": "98eef5c8", - "metadata": {}, - "source": [ - "Comparing this to the previous `R` model." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "58c99601", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) UIncomeM UIncomeH \n", - " 5.1317 0.1151 1.1561 \n", - " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n", - " 4.5121 5.5752 3.7381 \n", - "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n", - " 1.2473 2.4782 1.5141 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "9c979d7e", - "metadata": {}, - "source": [ - "We note a few important things:\n", - "\n", - "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n", - "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n", - "\n", - "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. 
For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n", - "\n", - "A few notes:\n", - "\n", - "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n", - "\n", - "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "0cb3b63a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "( UIncome[H] UIncome[L] UIncome[M]\n", - " 0 0.0 0.0 1.0\n", - " 1 0.0 1.0 0.0\n", - " 2 0.0 1.0 0.0\n", - " 3 1.0 0.0 0.0\n", - " 4 0.0 0.0 1.0\n", - " .. ... ... 
...\n", - " 395 1.0 0.0 0.0\n", - " 396 0.0 1.0 0.0\n", - " 397 0.0 1.0 0.0\n", - " 398 0.0 0.0 1.0\n", - " 399 0.0 1.0 0.0\n", - " \n", - " [400 rows x 3 columns],\n", - " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_encoding = contrast('UIncome', None)\n", - "design.build_columns(Carseats, full_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "272098d7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.131739\n", - "UIncome[M] 0.115150\n", - "UIncome[H] 1.156118\n", - "UIncome[H]:ShelveLoc[Good] 3.738052\n", - "UIncome[H]:ShelveLoc[Medium] 1.514104\n", - "UIncome[L]:ShelveLoc[Good] 4.512054\n", - "UIncome[L]:ShelveLoc[Medium] 1.247275\n", - "UIncome[M]:ShelveLoc[Good] 5.575170\n", - "UIncome[M]:ShelveLoc[Medium] 2.478163\n", - "dtype: float64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "fe05c471", - "metadata": {}, - "source": [ - "## Special encodings\n", - "\n", - "For flexible models, we may want to consider transformations of features, i.e. polynomial\n", - "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n", - "we can of course achieve this with a `Column` and an `encoder`. 
The `ISLP.transforms`\n", - "package includes a `Poly` transform" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "67062299", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import poly\n", - "poly('Income', 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "df5e5b4d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 5.440077\n", - "poly(Income, 3, )[0] 10.036373\n", - "poly(Income, 3, )[1] -2.799156\n", - "poly(Income, 3, )[2] 2.399601\n", - "ShelveLoc[Good] 4.808133\n", - "ShelveLoc[Medium] 1.889533\n", - "dtype: float64" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "markdown", - "id": "01be9c13", - "metadata": {}, - "source": [ - "Compare:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "3244d6f6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n", - " 5.440077 10.036373 -2.799156 2.399601 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.808133 1.889533 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "8ad5bb1d", - "metadata": {}, - "source": [ - "## Splines\n", - "\n", - "Support for natural and B-splines is also included" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "6a6f4358", - "metadata": 
{}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 4.240421\n", - "ns(Income, , df=5)[0] 1.468196\n", - "ns(Income, , df=5)[1] 1.499471\n", - "ns(Income, , df=5)[2] 1.152070\n", - "ns(Income, , df=5)[3] 2.418398\n", - "ns(Income, , df=5)[4] 1.804460\n", - "ShelveLoc[Good] 4.810449\n", - "ShelveLoc[Medium] 1.881095\n", - "dtype: float64" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ISLP.models.model_spec import ns, bs, pca\n", - "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "fb740953", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n", - " 4.240421 1.468196 1.499471 1.152070 \n", - "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n", - " 2.418398 1.804460 4.810449 1.881095 \n" - ] - } - ], - "source": [ - "%%R\n", - "library(splines)\n", - "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "fe1bf7fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 3.495085\n", - "bs(Income, , df=7, degree=2)[0] 1.813118\n", - "bs(Income, , df=7, degree=2)[1] 0.961852\n", - "bs(Income, , df=7, degree=2)[2] 2.471545\n", - "bs(Income, , df=7, degree=2)[3] 2.158891\n", - "bs(Income, , df=7, degree=2)[4] 2.091625\n", - "bs(Income, , df=7, degree=2)[5] 2.600669\n", - "bs(Income, , df=7, degree=2)[6] 2.843108\n", - "ShelveLoc[Good] 4.804919\n", - "ShelveLoc[Medium] 1.880337\n", - "dtype: float64" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n", - "X = 
design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "86e966e0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (Intercept) bs(Income, df = 7, degree = 2)1 \n", - " 3.4950851 1.8131176 \n", - "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n", - " 0.9618523 2.4715450 \n", - "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n", - " 2.1588908 2.0916252 \n", - "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n", - " 2.6006694 2.8431084 \n", - " ShelveLocGood ShelveLocMedium \n", - " 4.8049190 1.8803375 \n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef" - ] - }, - { - "cell_type": "markdown", - "id": "877d4784", - "metadata": {}, - "source": [ - "## PCA" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "8ba6cb20", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.419405\n", - "pca(myvars, , n_components=2)[0] -0.001131\n", - "pca(myvars, , n_components=2)[1] -0.024217\n", - "ShelveLoc[Good] 4.816253\n", - "ShelveLoc[Medium] 1.924139\n", - "dtype: float64" - ] - }, - 
"execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars'), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "f0319e51", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.419405 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n", - " 0.001131 \n", - "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n", - " -0.024217 \n", - " ShelveLocGood \n", - " 4.816253 \n", - " ShelveLocMedium \n", - " 1.924139 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "1f55086a", - "metadata": {}, - "source": [ - "It is of course common to scale before running PCA." 
- ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "bbe9e004", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "intercept 5.352159\n", - "pca(myvars, , n_components=2)[0] 0.446383\n", - "pca(myvars, , n_components=2)[1] -1.219788\n", - "ShelveLoc[Good] 4.922780\n", - "ShelveLoc[Medium] 2.005617\n", - "dtype: float64" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pca(['Income', \n", - " 'Price', \n", - " 'Advertising', \n", - " 'Population'], \n", - " n_components=2, \n", - " name='myvars',\n", - " scale=True), 'ShelveLoc'])\n", - "X = design.fit_transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "d78c02e4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n", - " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n", - "\n", - "Coefficients:\n", - " (Intercept) \n", - " 5.3522 \n", - "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n", - " 0.4469 \n", - 
"prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n", - " -1.2213 \n", - " ShelveLocGood \n", - " 4.9228 \n", - " ShelveLocMedium \n", - " 2.0056 \n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)" - ] - }, - { - "cell_type": "markdown", - "id": "8a03c603", - "metadata": {}, - "source": [ - "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n", - "of `np.std(ddof=1)`." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "f8215cef", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0.44694166, -1.22131519])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))" - ] - }, - { - "cell_type": "markdown", - "id": "a15d0ead", - "metadata": {}, - "source": [ - "## Submodels\n", - "\n", - "We can build submodels as well, even if the terms do not appear in the original `terms` argument.\n", - "Fundamentally, the terms just need to be able to have the `design.build_columns` work for us to be\n", - "able to build a design matrix. The initial inspection of the columns of `Carseats` has created\n", - "a column for `US`, hence we can build this submodel." - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "d58c6244", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptUS[Yes]
01.01.0
11.01.0
21.01.0
31.01.0
41.00.0
.........
3951.01.0
3961.01.0
3971.01.0
3981.01.0
3991.01.0
\n", - "

400 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " intercept US[Yes]\n", - "0 1.0 1.0\n", - "1 1.0 1.0\n", - "2 1.0 1.0\n", - "3 1.0 1.0\n", - "4 1.0 0.0\n", - ".. ... ...\n", - "395 1.0 1.0\n", - "396 1.0 1.0\n", - "397 1.0 1.0\n", - "398 1.0 1.0\n", - "399 1.0 1.0\n", - "\n", - "[400 rows x 2 columns]" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec(['UIncome', 'ShelveLoc', 'Price']).fit(Carseats)\n", - "design.build_submodel(Carseats, ['US'])" - ] - }, - { - "cell_type": "markdown", - "id": "9365ba27", - "metadata": {}, - "source": [ - "## ANOVA \n", - "\n", - "For a given `terms` argument, there as a natural sequence of models, namely those specified by `[terms[:i] for i in range(len(terms)+1]`." - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "332ab454", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['intercept'], dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]'], dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price'], dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n", - " 'UIncome[L]', 'UIncome[M]'],\n", - " dtype='object')\n", - "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n", - " 'UIncome[L]', 'UIncome[M]', 'US[Yes]'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "design = ModelSpec(['ShelveLoc', 'Price', 'UIncome', 'US']).fit(Carseats)\n", - "for D in design.build_sequence(Carseats):\n", - " print(D.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "f6cfd031", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1397.02172.7435552.01009.531143153.0108585.452815e-50
2396.01455.6407021.0717.102853217.3771921.583751e-39
3394.01378.9159382.076.72476411.6288851.239031e-05
4393.01296.4627001.082.45323824.9942578.678832e-07
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 397.0 2172.743555 2.0 1009.531143 153.010858 5.452815e-50\n", - "2 396.0 1455.640702 1.0 717.102853 217.377192 1.583751e-39\n", - "3 394.0 1378.915938 2.0 76.724764 11.628885 1.239031e-05\n", - "4 393.0 1296.462700 1.0 82.453238 24.994257 8.678832e-07" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "11c4aee8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "ShelveLoc 2 1009.53 504.77 153.011 < 2.2e-16 ***\n", - "Price 1 717.10 717.10 217.377 < 2.2e-16 ***\n", - "UIncome 2 76.72 38.36 11.629 1.240e-05 ***\n", - "US 1 82.45 82.45 24.994 8.679e-07 ***\n", - "Residuals 393 1296.46 3.30 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ ShelveLoc + Price + UIncome + US, data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "9a4e6e63", - "metadata": {}, - "source": [ - "Recall that `ModelSpec` does not inspect `terms` to reorder based on degree of \n", - "interaction as `R` does:" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "6e7bf361", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1393.02059.3764136.01122.89828435.9400471.175738e-34
2391.02036.0445962.023.3318172.2403101.077900e-01
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 393.0 2059.376413 6.0 1122.898284 35.940047 1.175738e-34\n", - "2 391.0 2036.044596 2.0 23.331817 2.240310 1.077900e-01" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([(full_encoding, 'ShelveLoc'), pref_encoding]).fit(Carseats)\n", - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "ed7d4bfa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "UIncome 2 61.92 30.962 5.9458 0.002859 ** \n", - "UIncome:ShelveLoc 6 1084.31 180.718 34.7049 < 2.2e-16 ***\n", - "Residuals 391 2036.04 5.207 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "0350da34", - "metadata": {}, - "source": [ - "To agree with `R` we must order `terms` as `R` will." - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "5ddaf87c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1397.03120.3513822.061.9233165.9458462.855424e-03
2391.02036.0445966.01084.30678534.7048681.346561e-33
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 397.0 3120.351382 2.0 61.923316 5.945846 2.855424e-03\n", - "2 391.0 2036.044596 6.0 1084.306785 34.704868 1.346561e-33" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]).fit(Carseats)\n", - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))" - ] - }, - { - "cell_type": "markdown", - "id": "1ef70ce3", - "metadata": {}, - "source": [ - "## More complicated interactions\n", - "\n", - "Can we have an interaction of a polynomial effect with a categorical? Absolutely" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "a1a14742", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "UIncome 2 61.92 30.9617 4.0310 0.01851 *\n", - "UIncome:poly(Income, 3) 9 79.72 8.8581 1.1533 0.32408 \n", - "UIncome:US 3 83.51 27.8367 3.6242 0.01324 *\n", - "Residuals 385 2957.12 7.6808 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ UIncome + poly(Income, 3):UIncome + UIncome:US, data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "a909be1a", - "metadata": {}, - "source": [ - "To match `R` we note that it has used its inspection rules to encode `UIncome` with 3 levels\n", - "for the two interactions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "ae286cf3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "intercept 65.978856\n", - "UIncome[M] -60.159607\n", - "UIncome[H] -147.276154\n", - "poly(Income, 3, )[0]:UIncome[H] 1957.694387\n", - "poly(Income, 3, )[0]:UIncome[L] 1462.060650\n", - "poly(Income, 3, )[0]:UIncome[M] 83.035153\n", - "poly(Income, 3, )[1]:UIncome[H] -984.494570\n", - "poly(Income, 3, )[1]:UIncome[L] 881.537647\n", - "poly(Income, 3, )[1]:UIncome[M] -18.006234\n", - "poly(Income, 3, )[2]:UIncome[H] 207.614692\n", - "poly(Income, 3, )[2]:UIncome[L] 217.190749\n", - "poly(Income, 3, )[2]:UIncome[M] 34.065434\n", - "UIncome[H]:US 0.903404\n", - "UIncome[L]:US 0.895538\n", - "UIncome[M]:US 1.048728\n", - "dtype: float64" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p3 = poly('Income', 3)\n", - "design = ModelSpec([pref_encoding, (p3, full_encoding), (full_encoding, 'US')]).fit(Carseats)\n", - "X = design.transform(Carseats)\n", - "sm.OLS(Y, X).fit().params" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "236ab2d2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1397.03120.3513822.061.9233164.0310320.018488
2388.03040.6285599.079.7228231.1532730.324049
3385.02957.1184443.083.5101153.6241810.013244
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 397.0 3120.351382 2.0 61.923316 4.031032 0.018488\n", - "2 388.0 3040.628559 9.0 79.722823 1.153273 0.324049\n", - "3 385.0 2957.118444 3.0 83.510115 3.624181 0.013244" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))" - ] - }, - { - "cell_type": "markdown", - "id": "0a45c720", - "metadata": {}, - "source": [ - "## Grouping columns for ANOVA\n", - "\n", - "The `Variable` construct can be used to group\n", - "variables together to get custom sequences of models for `anova_lm`." - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "f36c1b3b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['intercept'], dtype='object')\n", - "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]'], dtype='object')\n", - "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]', 'US[Yes]',\n", - " 'Advertising'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "group1 = Variable(('Price', pref_encoding), 'group1', None)\n", - "group2 = Variable(('US', 'Advertising'), 'group2', None)\n", - "design = ModelSpec([group1, group2]).fit(Carseats)\n", - "for D in design.build_sequence(Carseats):\n", - " print(D.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "3daf7638", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_residssrdf_diffss_diffFPr(>F)
0399.03182.2746980.0NaNNaNNaN
1396.02508.1877883.0674.08691039.3048412.970412e-22
2394.02252.3963432.0255.79144522.3721356.267562e-10
\n", - "
" - ], - "text/plain": [ - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 399.0 3182.274698 0.0 NaN NaN NaN\n", - "1 396.0 2508.187788 3.0 674.086910 39.304841 2.970412e-22\n", - "2 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))" - ] - }, - { - "cell_type": "markdown", - "id": "46c1ace8", - "metadata": {}, - "source": [ - "It is not clear this is simple to do in `R` as the formula object expands all parentheses." - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "0b87e430", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Response: Sales\n", - " Df Sum Sq Mean Sq F value Pr(>F) \n", - "Price 1 630.03 630.03 110.2079 < 2.2e-16 ***\n", - "UIncome 2 44.06 22.03 3.8533 0.02201 * \n", - "US 1 121.88 121.88 21.3196 5.270e-06 ***\n", - "Advertising 1 133.91 133.91 23.4247 1.868e-06 ***\n", - "Residuals 394 2252.40 5.72 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "anova(lm(Sales ~ (Price + UIncome) + (US + Advertising), data=Carseats))" - ] - }, - { - "cell_type": "markdown", - "id": "7c137360", - "metadata": {}, - "source": [ - "It can be done by building up the models\n", - "by hand and likely is possible to be done programmatically but it seems not obvious." 
- ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "b678d323", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ 1\n", - "Model 2: Sales ~ Price + UIncome\n", - "Model 3: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F) \n", - "1 399 3182.3 \n", - "2 396 2508.2 3 674.09 39.305 < 2.2e-16 ***\n", - "3 394 2252.4 2 255.79 22.372 6.268e-10 ***\n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "M1 = lm(Sales ~ 1, data=Carseats)\n", - "M2 = lm(Sales ~ Price + UIncome, data=Carseats)\n", - "M3 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n", - "anova(M1, M2, M3)" - ] - }, - { - "cell_type": "markdown", - "id": "b0388949", - "metadata": {}, - "source": [ - "## Alternative anova\n", - "\n", - "Another common ANOVA table involves dropping each term in succession from the model and comparing\n", - "to the full model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "ac5b916a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'intercept'}\n", - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 395.0 4417.273517 0.0 NaN NaN NaN\n", - "1 394.0 2252.396343 1.0 2164.877175 378.690726 1.359177e-59\n", - "{'Price', 'UIncome[H]', 'UIncome[M]'}\n", - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 397.0 2950.808154 0.0 NaN NaN NaN\n", - "1 394.0 2252.396343 3.0 698.411811 40.723184 6.077848e-23\n", - "{'US[Yes]', 'Advertising'}\n", - " df_resid ssr df_diff ss_diff F Pr(>F)\n", - "0 396.0 2508.187788 0.0 NaN NaN NaN\n", - "1 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10\n" - ] - } - ], - "source": [ - "Dfull = design.transform(Carseats)\n", - "Mfull = sm.OLS(Y, Dfull).fit()\n", - "for i, D in enumerate(design.build_sequence(Carseats, anova_type='drop')):\n", - " if i == 0:\n", - " D0 = D\n", - " print(set(D.columns) ^ set(Dfull.columns))\n", - " print(sm.stats.anova_lm(sm.OLS(Y, D).fit(), Mfull))" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "a0c71948", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ US + Advertising\n", - "Model 2: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F) \n", - "1 397 2950.8 \n", - "2 394 2252.4 3 698.41 40.723 < 2.2e-16 ***\n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ Price + UIncome\n", - "Model 2: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F) \n", - "1 396 2508.2 \n", - "2 394 2252.4 2 255.79 22.372 6.268e-10 ***\n", - "---\n", - "Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n" - ] - } - ], - "source": [ - "%%R\n", - "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n", - "M2 = lm(Sales ~ US + Advertising, data=Carseats)\n", - "print(anova(M2, M1))\n", - "M3 = lm(Sales ~ Price + UIncome, data=Carseats)\n", - "print(anova(M3, M1))" - ] - }, - { - "cell_type": "markdown", - "id": "a5e4880d", - "metadata": {}, - "source": [ - "The comparison without the intercept here is actually very hard to achieve in `R` with `anova` due to its inspection\n", - "of the formula." - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "id": "4b383401", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Analysis of Variance Table\n", - "\n", - "Model 1: Sales ~ Price + UIncome + US + Advertising - 1\n", - "Model 2: Sales ~ Price + UIncome + US + Advertising\n", - " Res.Df RSS Df Sum of Sq F Pr(>F)\n", - "1 394 2252.4 \n", - "2 394 2252.4 0 9.0949e-13 \n" - ] - } - ], - "source": [ - "%%R\n", - "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n", - "M4 = lm(Sales ~ Price + UIncome + US + Advertising - 1, data=Carseats)\n", - "print(anova(M4, M1))" - ] - }, - { - "cell_type": "markdown", - "id": "72d7c83b", - "metadata": {}, - "source": [ - "It can be found with `summary`." - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "4d5ce789", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Call:\n", - "lm(formula = Sales ~ Price + UIncome + US + Advertising, data = Carseats)\n", - "\n", - "Residuals:\n", - " Min 1Q Median 3Q Max \n", - "-7.4437 -1.6351 -0.0932 1.4920 6.8076 \n", - "\n", - "Coefficients:\n", - " Estimate Std. Error t value Pr(>|t|) \n", - "(Intercept) 12.520356 0.643390 19.460 < 2e-16 ***\n", - "Price -0.054000 0.005072 -10.647 < 2e-16 ***\n", - "UIncomeM 0.548906 0.281693 1.949 0.0521 . 
\n", - "UIncomeH 0.708219 0.322028 2.199 0.0284 * \n", - "USYes 0.024181 0.343246 0.070 0.9439 \n", - "Advertising 0.119509 0.024692 4.840 1.87e-06 ***\n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", - "\n", - "Residual standard error: 2.391 on 394 degrees of freedom\n", - "Multiple R-squared: 0.2922,\tAdjusted R-squared: 0.2832 \n", - "F-statistic: 32.53 on 5 and 394 DF, p-value: < 2.2e-16\n", - "\n" - ] - } - ], - "source": [ - "%%R\n", - "summary(M1)" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "56b82d02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(378.690726, 378.69160000000005)" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "378.690726, 19.46**2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "872f645c-1d6f-4d08-9eec-2b80276bc82c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb" - }, - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/transforms/PCA.ipynb b/docs/source/transforms/PCA.ipynb index 224992b..ec1e0ae 100644 --- a/docs/source/transforms/PCA.ipynb +++ b/docs/source/transforms/PCA.ipynb @@ -19,9 +19,14 @@ "outputs": [], "source": [ "import numpy as np\n", + "from sklearn.decomposition import PCA\n", + "\n", "from ISLP import load_data\n", - "from ISLP.models import ModelSpec, pca, Variable, derived_variable\n", - "from sklearn.decomposition import PCA" + "from ISLP.models import 
(ModelSpec, \n", + " pca, \n", + " Feature, \n", + " derived_feature,\n", + " build_columns)" ] }, { @@ -71,7 +76,7 @@ "id": "fff603bf", "metadata": {}, "source": [ - "Suppose we want to make a `Variable` representing the first 3 principal components of the\n", + "Suppose we want to make a `Feature` representing the first 3 principal components of the\n", " features `['CompPrice', 'Income', 'Advertising', 'Population', 'Price']`." ] }, @@ -80,8 +85,8 @@ "id": "eab49ad1-3957-478f-8a76-28a8f58551e9", "metadata": {}, "source": [ - "We first make a `Variable` that represents these five features columns, then `pca`\n", - "can be used to compute a new `Variable` that returns the first three principal components." + "We first make a `Feature` that represents these five features columns, then `pca`\n", + "can be used to compute a new `Feature` that returns the first three principal components." ] }, { @@ -91,7 +96,7 @@ "metadata": {}, "outputs": [], "source": [ - "grouped = Variable(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n", + "grouped = Feature(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n", "sklearn_pca = PCA(n_components=3, whiten=True)" ] }, @@ -100,7 +105,7 @@ "id": "b45655a3-393d-4b4c-b754-cda61ed0e014", "metadata": {}, "source": [ - "We can now fit `sklearn_pca` and create our new variable." + "We can now fit `sklearn_pca` and create our new feature." 
] }, { @@ -108,175 +113,18 @@ "execution_count": 5, "id": "6cfe8861-ad07-47b9-95d1-5d5513ff6fbe", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ - "sklearn_pca.fit(design.build_columns(Carseats, grouped)[0]) \n", - "pca_var = derived_variable(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n", + "grouped_features = build_columns(design.column_info_,\n", + " Carseats,\n", + " grouped)[0]\n", + "sklearn_pca.fit(grouped_features) \n", + "pca_var = derived_feature(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n", " name='pca(grouped)', encoder=sklearn_pca)\n", - "derived_features, _ = design.build_columns(Carseats, pca_var)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "aeb47184-9e15-4a6e-b60a-916f5ff89063", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CompPriceIncomeAdvertisingPopulationPrice
01387311276120
1111481626083
2113351026980
3117100446697
4141643340128
..................
39513810817203128
39613923337120
3971622612368159
39810079728495
39913437027120
\n", - "

400 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " CompPrice Income Advertising Population Price\n", - "0 138 73 11 276 120\n", - "1 111 48 16 260 83\n", - "2 113 35 10 269 80\n", - "3 117 100 4 466 97\n", - "4 141 64 3 340 128\n", - ".. ... ... ... ... ...\n", - "395 138 108 17 203 128\n", - "396 139 23 3 37 120\n", - "397 162 26 12 368 159\n", - "398 100 79 7 284 95\n", - "399 134 37 0 27 120\n", - "\n", - "[400 rows x 5 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.build_columns(Carseats, grouped)[0]" + "derived_features, _ = build_columns(design.column_info_,\n", + " Carseats, \n", + " pca_var,\n", + " encoders=design.encoders_)" ] }, { @@ -291,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "9f4b0955", "metadata": {}, "outputs": [], @@ -304,22 +152,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "6b382699-eb86-457f-8e91-09a63eb21d49", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n", - "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/plain": [ @@ -329,7 +165,7 @@ " dtype='object')" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -350,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 
8, "id": "4a8d9b28", "metadata": {}, "outputs": [], @@ -361,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "6efa6c67-86e1-4f51-86c2-25c838a90bf4", "metadata": {}, "outputs": [ @@ -371,7 +207,7 @@ "(4.073428490498941e-14, 0.0)" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/source/transforms/poly.ipynb b/docs/source/transforms/poly.ipynb index c2b740b..45c862e 100644 --- a/docs/source/transforms/poly.ipynb +++ b/docs/source/transforms/poly.ipynb @@ -168,7 +168,7 @@ "source": [ "## Underlying model\n", "\n", - "If we look at `quartic`, we see it is a `Variable`, i.e. it can be used to produce a set of columns\n", + "If we look at `quartic`, we see it is a `Feature`, i.e. it can be used to produce a set of columns\n", "in a design matrix when it is a term used in creating the `ModelSpec`.\n", "\n", "Its encoder is `Poly(degree=4)`. This is a special `sklearn` transform that expects a single column\n", diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 3b4fd24..0000000 --- a/environment.yml +++ /dev/null @@ -1,240 +0,0 @@ -name: islp_test -channels: - - defaults -dependencies: - - ca-certificates=2022.07.19=hca03da5_0 - - certifi=2022.9.14=py39hca03da5_0 - - libcxx=14.0.6=h848a8c0_0 - - libffi=3.4.2=hc377ac9_4 - - ncurses=6.3=h1a28f6b_3 - - openssl=1.1.1q=h1a28f6b_0 - - python=3.9.13=hbdb9e5c_1 - - readline=8.1.2=h1a28f6b_1 - - sqlite=3.39.2=h1058600_0 - - tk=8.6.12=hb8d0fd4_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h1a28f6b_1 - - zlib=1.2.12=h5a0b063_3 - - pip: - - absl-py==1.2.0 - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - alabaster==0.7.12 - - ansiwrap==0.8.4 - - anyio==3.6.1 - - appnope==0.1.3 - - argon2-cffi==21.3.0 - - argon2-cffi-bindings==21.2.0 - - astor==0.8.1 - - asttokens==2.0.8 - - astunparse==1.6.3 - - async-timeout==4.0.2 - - attrs==22.1.0 - - autograd==1.4 - - autograd-gamma==0.5.0 - - babel==2.10.3 - - 
backcall==0.2.0 - - beautifulsoup4==4.11.1 - - bleach==5.0.1 - - build==0.8.0 - - cachetools==4.2.4 - - cffi==1.15.1 - - charset-normalizer==2.1.1 - - click==8.1.3 - - commonmark==0.9.1 - - contourpy==1.0.5 - - cycler==0.11.0 - - debugpy==1.6.3 - - decorator==5.1.1 - - defusedxml==0.7.1 - - docutils==0.17.1 - - entrypoints==0.4 - - exceptiongroup==1.1.0 - - executing==1.0.0 - - fastjsonschema==2.16.2 - - flatbuffers==2.0.7 - - fonttools==4.37.2 - - formulaic==0.5.2 - - frozenlist==1.3.1 - - fsspec==2022.8.2 - - future==0.18.2 - - gast==0.4.0 - - google-auth==1.35.0 - - google-auth-oauthlib==0.4.6 - - google-pasta==0.2.0 - - grpcio==1.48.1 - - h5py==3.7.0 - - html2text==2020.1.16 - - idna==3.4 - - imagesize==1.4.1 - - importlib-metadata==4.12.0 - - iniconfig==2.0.0 - - interface-meta==1.3.0 - - ipykernel==6.15.3 - - ipython==8.5.0 - - ipython-genutils==0.2.0 - - ipywidgets==8.0.2 - - jaraco-classes==3.2.2 - - jedi==0.18.1 - - jinja2==3.1.2 - - joblib==1.2.0 - - json5==0.9.10 - - jsonschema==4.16.0 - - jupyter==1.0.0 - - jupyter-cache==0.5.0 - - jupyter-client==7.3.5 - - jupyter-console==6.4.4 - - jupyter-core==4.11.1 - - jupyter-server==1.18.1 - - jupyterlab==3.4.7 - - jupyterlab-pygments==0.2.2 - - jupyterlab-server==2.15.1 - - jupyterlab-widgets==3.0.3 - - jupytext==1.14.5 - - keras==2.10.0 - - keras-preprocessing==1.1.2 - - keyring==23.9.3 - - kiwisolver==1.4.4 - - l0bnb==1.0.0 - - libclang==14.0.6 - - lifelines==0.27.2 - - llvmlite==0.39.1 - - lxml==4.9.1 - - markdown==3.4.1 - - markdown-it-py==2.1.0 - - markupsafe==2.1.1 - - matplotlib==3.6.0 - - matplotlib-inline==0.1.6 - - mdit-py-plugins==0.3.0 - - mdurl==0.1.2 - - mistune==2.0.4 - - more-itertools==8.14.0 - - multidict==6.0.2 - - myst==1.0.4 - - myst-nb==0.16.0 - - myst-parser==0.18.0 - - nbclassic==0.4.3 - - nbclient==0.5.13 - - nbconvert==7.0.0 - - nbformat==5.5.0 - - nbsphinx==0.8.11 - - nest-asyncio==1.5.5 - - notebook==6.4.12 - - notebook-shim==0.1.0 - - numba==0.56.2 - - numpy==1.23.3 - - 
numpydoc==1.4.0 - - oauthlib==3.2.1 - - opt-einsum==3.3.0 - - packaging==21.3 - - pandas==1.5.0 - - pandocfilters==1.5.0 - - papermill==2.4.0 - - parso==0.8.3 - - patsy==0.5.2 - - pep517==0.13.0 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.2.0 - - pip==22.2.2 - - pkginfo==1.8.3 - - pluggy==1.0.0 - - portalocker==2.5.1 - - progressbar2==4.0.0 - - prometheus-client==0.14.1 - - prompt-toolkit==3.0.31 - - protobuf==3.19.5 - - psutil==5.9.2 - - ptyprocess==0.7.0 - - pure-eval==0.2.2 - - pyasn1==0.4.8 - - pyasn1-modules==0.2.8 - - pycparser==2.21 - - pydash==5.1.0 - - pydeprecate==0.3.2 - - pygam==0.8.0 - - pygments==2.13.0 - - pyparsing==3.0.9 - - pyrsistent==0.18.1 - - pytest==7.2.0 - - python-dateutil==2.8.2 - - python-utils==3.3.3 - - pytorch-lightning==1.7.6 - - pytz==2022.2.1 - - pytz-deprecation-shim==0.1.0.post0 - - pyyaml==6.0 - - pyzmq==24.0.0 - - qtconsole==5.3.2 - - qtpy==2.2.0 - - readme-renderer==37.1 - - requests==2.28.1 - - requests-oauthlib==1.3.1 - - requests-toolbelt==0.9.1 - - rfc3986==2.0.0 - - rich==12.5.1 - - rpy2==3.5.7 - - rsa==4.9 - - scikit-learn==1.1.2 - - scipy==1.9.1 - - send2trash==1.8.0 - - setuptools==59.8.0 - - six==1.16.0 - - sniffio==1.3.0 - - snowballstemmer==2.2.0 - - soupsieve==2.3.2.post1 - - sphinx==5.1.1 - - sphinx-markdown-builder==0.5.4 - - sphinx-rst-builder==0.0.3 - - sphinx-rtd-theme==1.1.1 - - sphinx-togglebutton==0.3.2 - - sphinxcontrib-applehelp==1.0.2 - - sphinxcontrib-devhelp==1.0.2 - - sphinxcontrib-htmlhelp==2.0.0 - - sphinxcontrib-jsmath==1.0.1 - - sphinxcontrib-qthelp==1.0.3 - - sphinxcontrib-serializinghtml==1.1.5 - - sqlalchemy==1.4.41 - - stack-data==0.5.0 - - statsmodels==0.13.2 - - tabulate==0.8.10 - - tenacity==6.3.1 - - tensorboard==2.10.0 - - tensorboard-data-server==0.6.1 - - tensorboard-plugin-wit==1.8.1 - - tensorflow-estimator==2.10.0 - - tensorflow-macos==2.10.0 - - tensorflow-metal==0.6.0 - - termcolor==2.0.1 - - terminado==0.15.0 - - texext==0.6.7 - - textwrap3==0.9.2 - - 
threadpoolctl==3.1.0 - - tinycss2==1.1.1 - - toml==0.10.2 - - tomli==2.0.1 - - torch==1.12.1 - - torchdata==0.4.1 - - torchinfo==1.7.0 - - torchmetrics==0.9.3 - - torchvision==0.13.1 - - tornado==6.2 - - tqdm==4.64.1 - - traitlets==5.4.0 - - twine==4.0.1 - - typing-extensions==4.3.0 - - tzdata==2022.7 - - tzlocal==4.2 - - unidecode==1.3.4 - - unify==0.5 - - untokenize==0.1.1 - - urllib3==1.26.12 - - wcwidth==0.2.5 - - webencodings==0.5.1 - - websocket-client==1.4.1 - - werkzeug==2.2.2 - - widgetsnbextension==4.0.3 - - wrapt==1.14.1 - - yapf==0.32.0 - - yarl==1.8.1 - - zipp==3.8.1 -prefix: /Users/jonathantaylor/miniconda3/envs/islp_test diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5fe63fd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,88 @@ +[project] +name = "ISLP" +dependencies = ["numpy>=1.7.1", + "scipy>=0.9", + "pandas>=1.5", + "lxml", # pandas needs this for html + "scikit-learn>=1.2", + "joblib", + "statsmodels>=0.13", + "lifelines", + "pygam", # for GAM in Ch7 + "torch", + "pytorch_lightning", + "torchmetrics", + ] +description = "Library for ISLP labs" +readme = "README.md" +requires-python = ">=3.10" +license = {file = "LICENSE"} +keywords = [] +authors = [ + {name = "Trevor Hastie", email="hastie@stanford.edu" }, + {name = "Gareth James", email="gareth@emory.edu"}, + {name = "Jonathan Taylor", email="jonathan.taylor@stanford.edu" }, + {name = "Rob Tibshirani", email="tibs@stanford.edu" }, + {name = "Daniela Witten", email="dwitten@uw.edu" }, + ] +maintainers = [ + {name = "Jonathan Taylor", email="jonathan.taylor@stanford.edu" }, + ] +classifiers = ["Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Topic :: Scientific/Engineering" + ] +dynamic = ["version"] + +[tool.setuptools] +packages = [ + "ISLP", + "ISLP.models", + "ISLP.bart", + 
"ISLP.torch", + "ISLP.data" +] +include-package-data = true + +[tool.setuptools.package-data] +ISLP = ["data/*.csv", "data/*.npy", "data/*.data"] + +[tool.setuptools.dynamic] +version = {attr = "ISLP.__version__"} # Assuming ISLP.__version__ holds your version + + +[project.urls] # Optional +"Homepage" = "https://github.com/intro-stat-learning/ISLP" +"Bug Reports" = "https://github.com/intro-stat-learning/ISLP/issues" +"Funding" = "https://donate.pypi.org" +"Say Thanks!" = "http://saythanks.io/to/example" +"Source" = "https://github.com/pypa/sampleproject/" + +[project.optional-dependencies] +doc = ['Sphinx>=3.0'] + +[build-system] +requires = ["setuptools>=42", + "wheel", + "Sphinx>=1.0", + "numpy", + "pandas", + "scipy", + "scikit-learn", + "joblib", + "statsmodels", + "versioneer[toml]" + ] +build-backend = "setuptools.build_meta" + +[tool.versioneer] +VCS = "git" +style = "pep440" +versionfile_source = "ISLP/_version.py" +versionfile_build = "ISLP/_version.py" +tag_prefix = "v" +parentdir_prefix = "ISLP-" diff --git a/requirements.txt b/requirements.txt index bf393e1..10bff6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,13 @@ numpy>=1.7.1 scipy>=0.9 -jupyter pandas>=0.20 +pandas<=1.9 lxml # pandas needs this for html -scikit-learn>=1.0 +scikit-learn>=1.2 joblib statsmodels>=0.13 lifelines -#l0bnb # for bestsubsets -#pygam # for GAM in Ch7 +pygam # for GAM in Ch7 +torch +pytorch_lightning +torchmetrics diff --git a/setup.cfg b/setup.cfg index 14d7ccd..c59c035 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,3 @@ -[versioneer] -VCS = git -style = pep440 -versionfile_source = ISLP/_version.py -tag_prefix = -parentdir_prefix = ISLP- + +[metadata] +license_files = LICENSE.txt \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100755 index 95fca7d..0000000 --- a/setup.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/env python -''' Installation script for ISLP package ''' - -import os -import sys -from os.path import 
join as pjoin, dirname, exists -from distutils.version import LooseVersion -# BEFORE importing distutils, remove MANIFEST. distutils doesn't properly -# update it when the contents of directories change. -if exists('MANIFEST'): os.remove('MANIFEST') - -# Unconditionally require setuptools -import setuptools - -# Package for getting versions from git tags -import versioneer - -# Import distutils _after_ setuptools import, and after removing -# MANIFEST -from distutils.core import setup -from distutils.extension import Extension - -# Get various parameters for this version, stored in ISLP/info.py - -class Bunch(object): - def __init__(self, vars): - for key, name in vars.items(): - if key.startswith('__'): - continue - self.__dict__[key] = name - -def read_vars_from(ver_file): - """ Read variables from Python text file - - Parameters - ---------- - ver_file : str - Filename of file to read - - Returns - ------- - info_vars : Bunch instance - Bunch object where variables read from `ver_file` appear as - attributes - """ - # Use exec for compabibility with Python 3 - ns = {} - with open(ver_file, 'rt') as fobj: - exec(fobj.read(), ns) - return Bunch(ns) - -info = read_vars_from(pjoin('ISLP', 'info.py')) - -class SetupDependency(object): - """ SetupDependency class - - Parameters - ---------- - import_name : str - Name with which required package should be ``import``ed. - min_ver : str - Distutils version string giving minimum version for package. - req_type : {'install_requires', 'setup_requires'}, optional - Setuptools dependency type. - heavy : {False, True}, optional - If True, and package is already installed (importable), then do not add - to the setuptools dependency lists. This prevents setuptools - reinstalling big packages when the package was installed without using - setuptools, or this is an upgrade, and we want to avoid the pip default - behavior of upgrading all dependencies. 
- install_name : str, optional - Name identifying package to install from pypi etc, if different from - `import_name`. - """ - - def __init__(self, import_name, - min_ver, - req_type='install_requires', - heavy=False, - install_name=None): - self.import_name = import_name - self.min_ver = min_ver - self.req_type = req_type - self.heavy = heavy - self.install_name = (import_name if install_name is None - else install_name) - - def check_fill(self, setuptools_kwargs): - """ Process this dependency, maybe filling `setuptools_kwargs` - - Run checks on this dependency. If not using setuptools, then raise - error for unmet dependencies. If using setuptools, add missing or - not-heavy dependencies to `setuptools_kwargs`. - - A heavy dependency is one that is inconvenient to install - automatically, such as numpy or (particularly) scipy, matplotlib. - - Parameters - ---------- - setuptools_kwargs : dict - Dictionary of setuptools keyword arguments that may be modified - in-place while checking dependencies. - """ - found_ver = get_pkg_version(self.import_name) - ver_err_msg = version_error_msg(self.import_name, - found_ver, - self.min_ver) - if not 'setuptools' in sys.modules: - # Not using setuptools; raise error for any unmet dependencies - if ver_err_msg is not None: - raise RuntimeError(ver_err_msg) - return - # Using setuptools; add packages to given section of - # setup/install_requires, unless it's a heavy dependency for which we - # already have an acceptable importable version. - if self.heavy and ver_err_msg is None: - return - new_req = '{0}>={1}'.format(self.import_name, self.min_ver) - old_reqs = setuptools_kwargs.get(self.req_type, []) - setuptools_kwargs[self.req_type] = old_reqs + [new_req] - -def get_pkg_version(pkg_name): - """ Return package version for `pkg_name` if installed - - Returns - ------- - pkg_version : str or None - Return None if package not importable. Return 'unknown' if standard - ``__version__`` string not present. 
Otherwise return version string. - """ - try: - pkg = __import__(pkg_name) - except ImportError: - return None - try: - return pkg.__version__ - except AttributeError: - return 'unknown' - -def version_error_msg(pkg_name, found_ver, min_ver): - """ Return informative error message for version or None - """ - if found_ver is None: - return 'We need package {0}, but not importable'.format(pkg_name) - if found_ver == 'unknown': - return 'We need {0} version {1}, but cannot get version'.format( - pkg_name, min_ver) - if LooseVersion(found_ver) >= LooseVersion(min_ver): - return None - return 'We need {0} version {1}, but found version {2}'.format( - pkg_name, found_ver, min_ver) - - - -# Try to preempt setuptools monkeypatching of Extension handling when Pyrex -# is missing. Otherwise the monkeypatched Extension will change .pyx -# filenames to .c filenames, and we probably don't have the .c files. -sys.path.insert(0, pjoin(dirname(__file__), 'fake_pyrex')) -# Set setuptools extra arguments -extra_setuptools_args = dict( - tests_require=['nose'], - test_suite='nose.collector', - zip_safe=False, - extras_require = dict( - doc=['Sphinx>=1.0'], - test=['nose>=0.10.1'])) - -# Define extensions -EXTS = [] - -SetupDependency('numpy', info.NUMPY_MIN_VERSION, - req_type='install_requires', - heavy=True).check_fill(extra_setuptools_args) -SetupDependency('scipy', info.SCIPY_MIN_VERSION, - req_type='install_requires', - heavy=True).check_fill(extra_setuptools_args) -SetupDependency('matplotlib', info.MATPLOTLIB_MIN_VERSION, - req_type='install_requires', - heavy=True).check_fill(extra_setuptools_args) -SetupDependency('pandas', info.PANDAS_MIN_VERSION, - req_type='install_requires', - heavy=True).check_fill(extra_setuptools_args) -SetupDependency('statsmodels', info.STATSMODELS_MIN_VERSION, - req_type='install_requires', - heavy=True).check_fill(extra_setuptools_args) -SetupDependency('scikit-learn', info.SKLEARN_MIN_VERSION, - req_type='install_requires', - 
heavy=True).check_fill(extra_setuptools_args) - -#requirements = open('requirements.txt').read().strip().split('\n') - -requirements = '''numpy -scipy -jupyter -pandas -lxml # pandas needs this for html -scikit-learn -joblib -pygam # for GAM in Ch7 -lifelines'''.split('\n') -#l0bnb # for bestsubsets - - - -for req in requirements: - req = req.split('#')[0] - import sys; sys.stderr.write(req+'\n') - SetupDependency(req, "0.0", - req_type='install_requires', - heavy=True).check_fill(extra_setuptools_args) - -cmdclass=versioneer.get_cmdclass() - -# get long_description - -if sys.version_info[0] > 2: - long_description = open('README.md', 'rt', encoding='utf-8').read() -else: - long_description = unicode(file('README.md').read(), 'utf-8') - -def main(**extra_args): - setup(name=info.NAME, - maintainer=info.MAINTAINER, - maintainer_email=info.MAINTAINER_EMAIL, - description=info.DESCRIPTION, - url=info.URL, - download_url=info.DOWNLOAD_URL, - license=info.LICENSE, - classifiers=info.CLASSIFIERS, - author=info.AUTHOR, - author_email=info.AUTHOR_EMAIL, - platforms=info.PLATFORMS, - version=versioneer.get_version(), - requires=info.REQUIRES, - provides=info.PROVIDES, - packages = ['ISLP', - 'ISLP.models', - 'ISLP.bart', - 'ISLP.torch' - ], - ext_modules = EXTS, - package_data = {"ISLP":["data/*csv", "data/*npy", "data/*data"]}, - include_package_data=True, - data_files=[], - scripts=[], - long_description=long_description, - cmdclass = cmdclass, - **extra_args - ) - -#simple way to test what setup will do -#python setup.py install --prefix=/tmp -if __name__ == "__main__": - main(**extra_setuptools_args) diff --git a/ISLP/bart/tests/test_bart.py b/tests/bart/test_bart.py similarity index 96% rename from ISLP/bart/tests/test_bart.py rename to tests/bart/test_bart.py index d12a0a2..903bb83 100644 --- a/ISLP/bart/tests/test_bart.py +++ b/tests/bart/test_bart.py @@ -19,8 +19,6 @@ def test_bart(): clone(B) - return B - if __name__ == "__main__": test_bart() diff --git 
a/ISLP/models/tests/__init__.py b/tests/deeplearning/__init__.py similarity index 100% rename from ISLP/models/tests/__init__.py rename to tests/deeplearning/__init__.py diff --git a/tests/deeplearning/test_hitters.py b/tests/deeplearning/test_hitters.py new file mode 100644 index 0000000..bf609b9 --- /dev/null +++ b/tests/deeplearning/test_hitters.py @@ -0,0 +1,481 @@ +import numpy as np +import pandas as pd +from matplotlib.pyplot import subplots +from sklearn.linear_model import \ + (LinearRegression, + Lasso) +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import KFold +from sklearn.pipeline import Pipeline +from ISLP import load_data +from ISLP.models import ModelSpec as MS +from sklearn.model_selection import \ + (train_test_split, + GridSearchCV) + +# torch + +import torch +from torch import nn +from torch.utils.data import TensorDataset + +# torch helpers + +from torchmetrics import MeanAbsoluteError +from torchinfo import summary + +# pytorch lightning + +from pytorch_lightning import Trainer +from pytorch_lightning.loggers import CSVLogger + +# setting seed + +from pytorch_lightning import seed_everything +seed_everything(0, workers=True) +torch.use_deterministic_algorithms(True, warn_only=True) + +# ISLP.torch + +from ISLP.torch import (SimpleDataModule, + SimpleModule, + ErrorTracker, + rec_num_workers) + + +def test_hitters(max_epochs=2, + num_lam=5): + + Hitters = load_data('Hitters').dropna() + n = Hitters.shape[0] + + # We will fit two linear models (least squares and lasso) and compare their performance + # to that of a neural network. For this comparison we will use mean absolute error on a validation dataset. + # \begin{equation*} + # \begin{split} + # \mbox{MAE}(y,\hat{y}) = \frac{1}{n} \sum_{i=1}^n |y_i-\hat{y}_i|. + # \end{split} + # \end{equation*} + # We set up the model matrix and the response. 
+ + # In[11]: + + + model = MS(Hitters.columns.drop('Salary'), intercept=False) + X = model.fit_transform(Hitters).to_numpy() + Y = Hitters['Salary'].to_numpy() + + + # The `to_numpy()` method above converts `pandas` + # data frames or series to `numpy` arrays. + # We do this because we will need to use `sklearn` to fit the lasso model, + # and it requires this conversion. + # We also use a linear regression method from `sklearn`, rather than the method + # in Chapter~3 from `statsmodels`, to facilitate the comparisons. + + # We now split the data into test and training, fixing the random + # state used by `sklearn` to do the split. + + # In[12]: + + + (X_train, + X_test, + Y_train, + Y_test) = train_test_split(X, + Y, + test_size=1/3, + random_state=1) + + + # ### Linear Models + # We fit the linear model and evaluate the test error directly. + + # In[13]: + + + hit_lm = LinearRegression().fit(X_train, Y_train) + Yhat_test = hit_lm.predict(X_test) + np.abs(Yhat_test - Y_test).mean() + + + # Next we fit the lasso using `sklearn`. We are using + # mean absolute error to select and evaluate a model, rather than mean squared error. + # The specialized solver we used in Section 6.5.2 uses only mean squared error. So here, with a bit more work, we create a cross-validation grid and perform the cross-validation directly. + # + # We encode a pipeline with two steps: we first normalize the features using a `StandardScaler()` transform, + # and then fit the lasso without further normalization. + + # In[14]: + + + scaler = StandardScaler(with_mean=True, with_std=True) + lasso = Lasso(warm_start=True, max_iter=30000) + standard_lasso = Pipeline(steps=[('scaler', scaler), + ('lasso', lasso)]) + + + # We need to create a grid of values for $\lambda$. As is common practice, + # we choose a grid of 100 values of $\lambda$, uniform on the log scale from `lam_max` down to `0.01*lam_max`. Here `lam_max` is the smallest value of + # $\lambda$ with an all-zero solution. 
This value equals the largest absolute inner-product between any predictor and the (centered) response. {The derivation of this result is beyond the scope of this book.} + + # In[15]: + + + X_s = scaler.fit_transform(X_train) + n = X_s.shape[0] + lam_max = np.fabs(X_s.T.dot(Y_train - Y_train.mean())).max() / n + param_grid = {'alpha': np.exp(np.linspace(0, np.log(0.01), num_lam)) + * lam_max} + + + # Note that we had to transform the data first, since the scale of the variables impacts the choice of $\lambda$. + # We now perform cross-validation using this sequence of $\lambda$ values. + + # In[16]: + + + cv = KFold(10, + shuffle=True, + random_state=1) + grid = GridSearchCV(lasso, + param_grid, + cv=cv, + scoring='neg_mean_absolute_error') + grid.fit(X_train, Y_train); + + + # We extract the lasso model with best cross-validated mean absolute error, and evaluate its + # performance on `X_test` and `Y_test`, which were not used in + # cross-validation. + + # In[17]: + + + trained_lasso = grid.best_estimator_ + Yhat_test = trained_lasso.predict(X_test) + np.fabs(Yhat_test - Y_test).mean() + + + # This is similar to the results we got for the linear model fit by least squares. However, these results can vary a lot for different train/test splits; we encourage the reader to try a different seed in code block 12 and rerun the subsequent code up to this point. + # + # ### Specifying a Network: Classes and Inheritance + # To fit the neural network, we first set up a model structure + # that describes the network. + # Doing so requires us to define new classes specific to the model we wish to fit. + # Typically this is done in `pytorch` by sub-classing a generic + # representation of a network, which is the approach we take here. + # Although this example is simple, we will go through the steps in some detail, since it will serve us well + # for the more complex examples to follow. 
+ + # In[18]: + + + class HittersModel(nn.Module): + + def __init__(self, input_size): + super(HittersModel, self).__init__() + self.flatten = nn.Flatten() + self.sequential = nn.Sequential( + nn.Linear(input_size, 50), + nn.ReLU(), + nn.Dropout(0.4), + nn.Linear(50, 1)) + + def forward(self, x): + x = self.flatten(x) + return torch.flatten(self.sequential(x)) + + + # The `class` statement identifies the code chunk as a + # declaration for a class `HittersModel` + # that inherits from the base class `nn.Module`. This base + # class is ubiquitous in `torch` and represents the + # mappings in the neural networks. + # + # Indented beneath the `class` statement are the methods of this class: + # in this case `__init__` and `forward`. The `__init__` method is + # called when an instance of the class is created as in the cell + # below. In the methods, `self` always refers to an instance of the + # class. In the `__init__` method, we have attached two objects to + # `self` as attributes: `flatten` and `sequential`. These are used in + # the `forward` method to describe the map that this module implements. + # + # There is one additional line in the `__init__` method, which + # is a call to + # `super()`. This function allows subclasses (i.e. `HittersModel`) + # to access methods of the class they inherit from. For example, + # the class `nn.Module` has its own `__init__` method, which is different from + # the `HittersModel.__init__()` method we’ve written above. + # Using `super()` allows us to call the method of the base class. For + # `torch` models, we will always be making this `super()` call as it is necessary + # for the model to be properly interpreted by `torch`. + # + # The object `nn.Module` has more methods than simply `__init__` and `forward`. These + # methods are directly accessible to `HittersModel` instances because of this inheritance. 
+ # One such method we will see shortly is the `eval()` method, used + # to disable dropout for when we want to evaluate the model on test data. + + # In[19]: + + + hit_model = HittersModel(X.shape[1]) + + + # The object `self.sequential` is a composition of four maps. The + # first maps the 19 features of `Hitters` to 50 dimensions, introducing $50\times 19+50$ parameters + # for the weights and *intercept* of the map (often called the *bias*). This layer + # is then mapped to a ReLU layer followed by a 40% dropout layer, and finally a + # linear map down to 1 dimension, again with a bias. The total number of + # trainable parameters is therefore $50\times 19+50+50+1=1051$. + + # The package `torchinfo` provides a `summary()` function that neatly summarizes + # this information. We specify the size of the input and see the size + # of each tensor as it passes through layers of the network. + + # In[20]: + + + summary(hit_model, + input_size=X_train.shape, + col_names=['input_size', + 'output_size', + 'num_params']) + + + # We have truncated the end of the output slightly, here and in subsequent uses. + # + # We now need to transform our training data into a form accessible to `torch`. + # The basic + # datatype in `torch` is a `tensor`, which is very similar + # to an `ndarray` from early chapters. + # We also note here that `torch` typically + # works with 32-bit (*single precision*) + # rather than 64-bit (*double precision*) floating point numbers. + # We therefore convert our data to `np.float32` before + # forming the tensor. + # The $X$ and $Y$ tensors are then arranged into a `Dataset` + # recognized by `torch` + # using `TensorDataset()`. + + # In[21]: + + + X_train_t = torch.tensor(X_train.astype(np.float32)) + Y_train_t = torch.tensor(Y_train.astype(np.float32)) + hit_train = TensorDataset(X_train_t, Y_train_t) + + + # We do the same for the test data. 
+ + # In[22]: + + + X_test_t = torch.tensor(X_test.astype(np.float32)) + Y_test_t = torch.tensor(Y_test.astype(np.float32)) + hit_test = TensorDataset(X_test_t, Y_test_t) + + + # Finally, this dataset is passed to a `DataLoader()` which ultimately + # passes data into our network. While this may seem + # like a lot of overhead, this structure is helpful for more + # complex tasks where data may live on different machines, + # or where data must be passed to a GPU. + # We provide a helper function `SimpleDataModule()` in `ISLP` to make this task easier for + # standard usage. + # One of its arguments is `num_workers`, which indicates + # how many processes we will use + # for loading the data. For small + # data like `Hitters` this will have little effect, but + # it does provide an advantage for the `MNIST` and `CIFAR100` examples below. + # The `torch` package will inspect the process running and determine a + # maximum number of workers. {This depends on the computing hardware and the number of cores available.} We’ve included a function + # `rec_num_workers()` to compute this so we know how many + # workers might be reasonable (here the max was 16). + + # In[23]: + + + max_num_workers = rec_num_workers() + + + # The general training setup in `pytorch_lightning` involves + # training, validation and test data. These are each + # represented by different data loaders. During each epoch, + # we run a training step to learn the model and a validation + # step to track the error. The test data is typically + # used at the end of training to evaluate the model. + # + # In this case, as we had split only into test and training, + # we’ll use the test data as validation data with the + # argument `validation=hit_test`. The + # `validation` argument can be a float between 0 and 1, an + # integer, or a + # `Dataset`. If a float (respectively, integer), it is interpreted + # as a percentage (respectively number) of the *training* observations to be used for validation. 
+ # If it is a `Dataset`, it is passed directly to a data loader. + + # In[24]: + + + hit_dm = SimpleDataModule(hit_train, + hit_test, + batch_size=32, + num_workers=min(4, max_num_workers), + validation=hit_test) + + + # Next we must provide a `pytorch_lightning` module that controls + # the steps performed during the training process. We provide methods for our + # `SimpleModule()` that simply record the value + # of the loss function and any additional + # metrics at the end of each epoch. These operations + # are controlled by the methods `SimpleModule.[training/test/validation]_step()`, though + # we will not be modifying these in our examples. + + # In[25]: + + + hit_module = SimpleModule.regression(hit_model, + metrics={'mae':MeanAbsoluteError()}) + + + # By using the `SimpleModule.regression()` method, we indicate that we will use squared-error loss as in + # (10.23). + # We have also asked for mean absolute error to be tracked as well + # in the metrics that are logged. + # + # We log our results via `CSVLogger()`, which in this case stores the results in a CSV file within a directory `logs/hitters`. After the fitting is complete, this allows us to load the + # results as a `pd.DataFrame()` and visualize them below. There are + # several ways to log the results within `pytorch_lightning`, though + # we will not cover those here in detail. + + # In[26]: + + + hit_logger = CSVLogger('logs', name='hitters') + + + # Finally we are ready to train our model and log the results. We + # use the `Trainer()` object from `pytorch_lightning` + # to do this work. The argument `datamodule=hit_dm` tells the trainer + # how training/validation/test logs are produced, + # while the first argument `hit_module` + # specifies the network architecture + # as well as the training/validation/test steps. + # The `callbacks` argument allows for + # several tasks to be carried out at various + # points while training a model. 
Here + # our `ErrorTracker()` callback will enable + # us to compute validation error while training + # and, finally, the test error. + # We now fit the model for 50 epochs. + + # In[27]: + + + hit_trainer = Trainer(deterministic=True, + max_epochs=max_epochs, + log_every_n_steps=5, + logger=hit_logger, + callbacks=[ErrorTracker()]) + hit_trainer.fit(hit_module, datamodule=hit_dm) + + + # At each step of SGD, the algorithm randomly selects 32 training observations for + # the computation of the gradient. Recall from Section 10.7 + # that an epoch amounts to the number of SGD steps required to process $n$ + # observations. Since the training set has + # $n=175$, and we specified a `batch_size` of 32 in the construction of `hit_dm`, an epoch is $175/32=5.5$ SGD steps. + # + # After having fit the model, we can evaluate performance on our test + # data using the `test()` method of our trainer. + + # In[28]: + + + hit_trainer.test(hit_module, datamodule=hit_dm) + + + # The results of the fit have been logged into a CSV file. We can find the + # results specific to this run in the `experiment.metrics_file_path` + # attribute of our logger. Note that each time the model is fit, the logger will output + # results into a new subdirectory of our directory `logs/hitters`. + # + # We now create a plot of the MAE (mean absolute error) as a function of + # the number of epochs. + # First we retrieve the logged summaries. + + # In[29]: + + + hit_results = pd.read_csv(hit_logger.experiment.metrics_file_path) + + + # Since we will produce similar plots in later examples, we write a + # simple generic function to produce this plot. 
+ + # In[30]: + + + def summary_plot(results, + ax, + col='loss', + valid_legend='Validation', + training_legend='Training', + ylabel='Loss', + fontsize=20): + for (column, + color, + label) in zip([f'train_{col}_epoch', + f'valid_{col}'], + ['black', + 'red'], + [training_legend, + valid_legend]): + results.plot(x='epoch', + y=column, + label=label, + marker='o', + color=color, + ax=ax) + ax.set_xlabel('Epoch') + ax.set_ylabel(ylabel) + return ax + + + # We now set up our axes, and use our function to produce the MAE plot. + + # In[31]: + + + fig, ax = subplots(1, 1, figsize=(6, 6)) + ax = summary_plot(hit_results, + ax, + col='mae', + ylabel='MAE', + valid_legend='Validation (=Test)') + ax.set_ylim([0, 400]) + ax.set_xticks(np.linspace(0, 50, 11).astype(int)); + + + # We can predict directly from the final model, and + # evaluate its performance on the test data. + # Before fitting, we call the `eval()` method + # of `hit_model`. + # This tells + # `torch` to effectively consider this model to be fitted, so that + # we can use it to predict on new data. For our model here, + # the biggest change is that the dropout layers will + # be turned off, i.e. no weights will be randomly + # dropped in predicting on new data. 
+ + # In[32]: + + + hit_model.eval() + preds = hit_module(X_test_t) + torch.abs(Y_test_t - preds).mean() + + + diff --git a/tests/deeplearning/test_mnist.py b/tests/deeplearning/test_mnist.py new file mode 100644 index 0000000..c6d39d9 --- /dev/null +++ b/tests/deeplearning/test_mnist.py @@ -0,0 +1,258 @@ + +# torch + +import torch +from torch import nn + +# torch helpers + +from torchinfo import summary + +# pytorch lightning + +from pytorch_lightning import Trainer +from pytorch_lightning.loggers import CSVLogger + +# setting seed + +from pytorch_lightning import seed_everything +seed_everything(0, workers=True) +torch.use_deterministic_algorithms(True, warn_only=True) + +# ISLP.torch + +from ISLP.torch import (SimpleDataModule, + SimpleModule, + ErrorTracker) + +from torchvision.datasets import MNIST +from torchvision.transforms import ToTensor + +def test_mnist(max_epochs=2): + + + # ## Multilayer Network on the MNIST Digit Data + # The `torchvision` package comes with a number of example datasets, + # including the `MNIST` digit data. Our first step is to retrieve + # the training and test data sets; the `MNIST()` function within + # `torchvision.datasets` is provided for this purpose. The + # data will be downloaded the first time this function is executed, and stored in the directory `data/MNIST`. + + # In[34]: + + + (mnist_train, + mnist_test) = [MNIST(root='data', + train=train, + download=True, + transform=ToTensor()) + for train in [True, False]] + mnist_train + + + # There are 60,000 images in the training data and 10,000 in the test + # data. The images are $28\times 28$, and stored as a matrix of pixels. We + # need to transform each one into a vector. + # + # Neural networks are somewhat sensitive to the scale of the inputs, much as ridge and + # lasso regularization are affected by scaling. Here the inputs are eight-bit + # grayscale values between 0 and 255, so we rescale to the unit + # interval. {Note: eight bits means $2^8$, which equals 256. 
Since the convention + # is to start at $0$, the possible values range from $0$ to $255$.} + # This transformation, along with some reordering + # of the axes, is performed by the `ToTensor()` transform + # from the `torchvision.transforms` package. + # + # As in our `Hitters` example, we form a data module + # from the training and test datasets, setting aside 20% + # of the training images for validation. + + # In[35]: + + + mnist_dm = SimpleDataModule(mnist_train, + mnist_test, + validation=0.2, + num_workers=2, + batch_size=256) + + + # Let’s take a look at the data that will get fed into our network. We loop through the first few + # chunks of the test dataset, breaking after 2 batches: + + # In[36]: + + + for idx, (X_ ,Y_) in enumerate(mnist_dm.train_dataloader()): + print('X: ', X_.shape) + print('Y: ', Y_.shape) + if idx >= 1: + break + + + # We see that the $X$ for each batch consists of 256 images of size `1x28x28`. + # Here the `1` indicates a single channel (greyscale). For RGB images such as `CIFAR100` below, + # we will see that the `1` in the size will be replaced by `3` for the three RGB channels. + # + # Now we are ready to specify our neural network. + + # In[37]: + + + class MNISTModel(nn.Module): + def __init__(self): + super(MNISTModel, self).__init__() + self.layer1 = nn.Sequential( + nn.Flatten(), + nn.Linear(28*28, 256), + nn.ReLU(), + nn.Dropout(0.4)) + self.layer2 = nn.Sequential( + nn.Linear(256, 128), + nn.ReLU(), + nn.Dropout(0.3)) + self._forward = nn.Sequential( + self.layer1, + self.layer2, + nn.Linear(128, 10)) + def forward(self, x): + return self._forward(x) + + + # We see that in the first layer, each `1x28x28` image is flattened, then mapped to + # 256 dimensions where we apply a ReLU activation with 40% dropout. + # A second layer maps the first layer’s output down to + # 128 dimensions, applying a ReLU activation with 30% dropout. 
Finally, + # the 128 dimensions are mapped down to 10, the number of classes in the + # `MNIST` data. + + # In[38]: + + + mnist_model = MNISTModel() + + + # We can check that the model produces output of expected size based + # on our existing batch `X_` above. + + # In[39]: + + + mnist_model(X_).size() + + + # Let’s take a look at the summary of the model. Instead of an `input_size` we can pass + # a tensor of correct shape. In this case, we pass through the final + # batched `X_` from above. + + # In[40]: + + + summary(mnist_model, + input_data=X_, + col_names=['input_size', + 'output_size', + 'num_params']) + + + # Having set up both the model and the data module, fitting this model is + # now almost identical to the `Hitters` example. In contrast to our regression model, here we will use the + # `SimpleModule.classification()` method which + # uses the cross-entropy loss function instead of mean squared error. It must be supplied with the number of classes in the problem. + + # In[41]: + + + mnist_module = SimpleModule.classification(mnist_model, + num_classes=10) + mnist_logger = CSVLogger('logs', name='MNIST') + + + # Now we are ready to go. The final step is to supply training data, and fit the model. + + # In[42]: + + + mnist_trainer = Trainer(deterministic=True, + max_epochs=max_epochs, + logger=mnist_logger, + callbacks=[ErrorTracker()]) + mnist_trainer.fit(mnist_module, + datamodule=mnist_dm) + + + # We have suppressed the output here, which is a progress report on the + # fitting of the model, grouped by epoch. This is very useful, since on + # large datasets fitting can take time. Fitting this model took 245 + # seconds on a MacBook Pro with an Apple M1 Pro chip with 10 cores and 16 GB of RAM. + # Here we specified a + # validation split of 20%, so training is actually performed on + # 80% of the 60,000 observations in the training set. This is an + # alternative to actually supplying validation data, like we did for the `Hitters` data. 
+ # SGD uses batches + # of 256 observations in computing the gradient, and doing the + # arithmetic, we see that an epoch corresponds to 188 gradient steps. + + # `SimpleModule.classification()` includes + # an accuracy metric by default. Other + # classification metrics can be added from `torchmetrics`. + # We will use our `summary_plot()` function to display + # accuracy across epochs. + + + mnist_trainer.test(mnist_module, + datamodule=mnist_dm) + + + # Table 10.1 also reports the error rates resulting from LDA (Chapter 4) and multiclass logistic + # regression. For LDA we refer the reader to Section 4.7.3. + # Although we could use the `sklearn` function `LogisticRegression()` to fit + # multiclass logistic regression, we are set up here to fit such a model + # with `torch`. + # We just have an input layer and an output layer, and omit the hidden layers! + + # In[45]: + + + class MNIST_MLR(nn.Module): + def __init__(self): + super(MNIST_MLR, self).__init__() + self.linear = nn.Sequential(nn.Flatten(), + nn.Linear(784, 10)) + def forward(self, x): + return self.linear(x) + + mlr_model = MNIST_MLR() + mlr_module = SimpleModule.classification(mlr_model, + num_classes=10) + mlr_logger = CSVLogger('logs', name='MNIST_MLR') + + + # In[46]: + + + mlr_trainer = Trainer(deterministic=True, + max_epochs=30, + callbacks=[ErrorTracker()]) + mlr_trainer.fit(mlr_module, datamodule=mnist_dm) + + + # We fit the model just as before and compute the test results. + + # In[47]: + + + mlr_trainer.test(mlr_module, + datamodule=mnist_dm) + + + # The accuracy is above 90% even for this pretty simple model. + # + # As in the `Hitters` example, we delete some of + # the objects we created above. 
+ + # In[48]: + + + + diff --git a/tests/models/test_boolean_columns.py b/tests/models/test_boolean_columns.py new file mode 100644 index 0000000..7b5a429 --- /dev/null +++ b/tests/models/test_boolean_columns.py @@ -0,0 +1,23 @@ +import pandas as pd +import statsmodels.api as sm +import numpy as np +from itertools import combinations + +from ISLP.models import ModelSpec as MS + +rng = np.random.default_rng(0) + +df = pd.DataFrame({'A':rng.standard_normal(10), + 'B':np.array([1,2,3,2,1,1,1,3,2,1], int), + 'C':np.array([True,False,False,True,True]*2, bool), + 'D':rng.standard_normal(10)}) +Y = rng.standard_normal(10) + +def test_all(): + + for i in range(1, 5): + for comb in combinations(['A','B','C','D'], i): + + X = MS(comb).fit_transform(df) + sm.OLS(Y, X).fit() + diff --git a/ISLP/models/tests/test_columns.py b/tests/models/test_columns.py similarity index 79% rename from ISLP/models/tests/test_columns.py rename to tests/models/test_columns.py index a86941b..77ba784 100644 --- a/ISLP/models/tests/test_columns.py +++ b/tests/models/test_columns.py @@ -3,6 +3,7 @@ from pandas.api.types import CategoricalDtype from ISLP.models.columns import _get_column_info +from ISLP.models.model_spec import Contrast def test_column_info(): @@ -15,5 +16,7 @@ def test_column_info(): print(_get_column_info(df, df.columns, [False]*4+[True], - [False]*5)) + [False]*5, + categorical_encoders={'categorical':Contrast(method='drop')})) + diff --git a/ISLP/models/tests/test_model_matrix.py b/tests/models/test_model_matrix.py similarity index 86% rename from ISLP/models/tests/test_model_matrix.py rename to tests/models/test_model_matrix.py index 51e079c..70b9cab 100644 --- a/ISLP/models/tests/test_model_matrix.py +++ b/tests/models/test_model_matrix.py @@ -2,7 +2,7 @@ from sklearn.base import clone from ISLP.transforms import Poly, NaturalSpline, BSpline, Interaction -from ISLP.models.model_spec import ModelSpec, Variable, ns, bs, poly, pca, contrast, Contrast +from ISLP.models.model_spec 
import ModelSpec, Feature, ns, bs, poly, pca, contrast, Contrast, build_model from sklearn.preprocessing import (OneHotEncoder, OrdinalEncoder) @@ -37,7 +37,7 @@ def test_ndarray(): X = rng.standard_normal((50,5)) M = ModelSpec(terms=[1, (3,2)], - default_encoders=default_encoders) + categorical_encoders=default_encoders) M.fit(X) MX = M.transform(X) @@ -51,7 +51,7 @@ def test_dataframe1(): D = pd.DataFrame(X, columns=['A','B','C','D','E']) M = ModelSpec(terms=['A','D',('D','E')], - default_encoders=default_encoders) + categorical_encoders=default_encoders) clone(M) MX = np.asarray(M.fit_transform(D)) @@ -66,7 +66,7 @@ def test_dataframe2(): D = pd.DataFrame(X, columns=['V','B','A','D','E']) M = ModelSpec(terms=['A', 'D', 'B', ('D','E'), 'V'], - default_encoders=default_encoders) + categorical_encoders=default_encoders) clone(M) MX = M.fit_transform(D) @@ -83,7 +83,7 @@ def test_dataframe3(): D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True)) M = ModelSpec(terms=['A', 'E', ('D','E')], - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = np.asarray(M.fit_transform(D)) M2 = clone(M) @@ -105,7 +105,7 @@ def test_dataframe4(): D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True)) M = ModelSpec(terms=['A', 'E', ('D','E'), 'D'], - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = np.asarray(M.fit_transform(D)) DE = pd.get_dummies(D['E']) @@ -119,7 +119,6 @@ def test_dataframe4(): np.testing.assert_allclose(MX, MX2) print(MX2.columns) - return M, D def test_dataframe5(): @@ -130,7 +129,7 @@ def test_dataframe5(): D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True)) M = ModelSpec(terms=['A', 'E', ('D','E')], - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = np.asarray(M.fit_transform(D)) # check they agree on copy of dataframe @@ -144,12 +143,12 @@ def test_dataframe6(): rng = np.random.default_rng(11) X = rng.standard_normal((50,5)) D = 
pd.DataFrame(X, columns=['A','B','C','D','E']) - W = Variable(('A','E'), 'AE', None) + W = Feature(('A','E'), 'AE', None) D['D'] = pd.Categorical(rng.choice(['a','b','c'], 50, replace=True)) D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True)) M = ModelSpec(terms=['A',W,(W,'D',)], - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = M.fit_transform(D) MX = np.asarray(MX) @@ -163,7 +162,7 @@ def test_dataframe7(): D['Eee'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True)) M = ModelSpec(terms=D.columns.drop(['Y','C']), - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = M.fit_transform(D) print(MX.columns) MX = np.asarray(MX) @@ -178,9 +177,9 @@ def test_dataframe8(): poly = Poly(degree=3) # raises a ValueError because poly will have been already fit -- need new instance of Poly - W = Variable(('A',), 'poly(A)', poly) + W = Feature(('A',), 'poly(A)', poly) M = ModelSpec(terms=list(D.columns.drop(['Y','C'])) + [(W,'E')], - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = M.fit_transform(D) print(MX.columns) @@ -196,10 +195,10 @@ def test_dataframe9(): poly = Poly(degree=3) # raises a ValueError because poly will have been already fit -- need new instance of Poly - W = Variable(('A',), 'poly(A)', poly) - U = Variable(('B',), 'poly(B)', clone(poly)) + W = Feature(('A',), 'poly(A)', poly) + U = Feature(('B',), 'poly(B)', clone(poly)) M = ModelSpec(terms=list(D.columns.drop(['Y','C'])) + [W,U], - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = M.fit_transform(D) print(MX.columns) @@ -210,13 +209,13 @@ def test_dataframe10(): rng = np.random.default_rng(15) X = rng.standard_normal((50,5)) D = pd.DataFrame(X, columns=['A','B','C','D','E']) - W = Variable(('A','E'), 'AE', None) - U = Variable((W, 'C'), 'WC', None) + W = Feature(('A','E'), 'AE', None) + U = Feature((W, 'C'), 'WC', None) D['D'] = 
pd.Categorical(rng.choice(['a','b','c'], 50, replace=True)) D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True)) M = ModelSpec(terms=['A', 'E', 'C', W, (W, 'D',), U], - default_encoders=default_encoders) + categorical_encoders=default_encoders) MX = M.fit_transform(D) print(MX.columns) MX = np.asarray(MX) @@ -258,7 +257,11 @@ def test_submodel(): M.fit(D) MX = M.transform(D) - MXsub = M.build_submodel(D, M.terms[:2]) + MXsub = build_model(M.column_info_, + D, + M.terms[:2], + intercept=M.intercept, + encoders=M.encoders_) print(MX.columns) print(MXsub.columns) @@ -275,7 +278,11 @@ def test_contrast(): M.fit(D) MX = M.transform(D) - MXsub = M.build_submodel(D, M.terms[:2]) + MXsub = build_model(M.column_info_, + D, + M.terms[:2], + intercept=M.intercept, + encoders=M.encoders_) print(method, MX.columns) print(MXsub.columns) @@ -309,7 +316,7 @@ def test_pca(): X = rng.standard_normal((50,8)) D = pd.DataFrame(X, columns=['A','B','C','D','E', 'F', 'G', 'H']) - pca_ = Variable(('A','B','C','D'), 'pca(ABCD)', PCA(n_components=2)) + pca_ = Feature(('A','B','C','D'), 'pca(ABCD)', PCA(n_components=2)) M = ModelSpec(terms=[poly('F', intercept=True, degree=3), pca_]) diff --git a/ISLP/models/tests/test_selection.py b/tests/models/test_selection.py similarity index 100% rename from ISLP/models/tests/test_selection.py rename to tests/models/test_selection.py diff --git a/tests/models/test_sklearn_wrap.py b/tests/models/test_sklearn_wrap.py new file mode 100644 index 0000000..c3616bd --- /dev/null +++ b/tests/models/test_sklearn_wrap.py @@ -0,0 +1,46 @@ + +import numpy as np +import pandas as pd +import statsmodels.api as sm +from sklearn.base import is_classifier, is_regressor +import pytest + +from ISLP.models.sklearn_wrap import sklearn_sm, sklearn_selected +from ISLP.models.model_spec import ModelSpec +from ISLP.models.strategy import min_max + +@pytest.fixture +def model_setup(): + X = pd.DataFrame({'X1': np.random.rand(10), 'X2': np.random.rand(10), 'X3': 
np.random.rand(10)}) + y = pd.Series(np.random.randint(0, 2, 10)) # For classifier + model_spec_dummy = ModelSpec(['X1', 'X2', 'X3']).fit(X) + min_max_strategy_dummy = min_max(model_spec_dummy, min_terms=1, max_terms=2) + return X, y, model_spec_dummy, min_max_strategy_dummy + +def test_OLS_is_regressor(): + model = sklearn_sm(sm.OLS) + assert model.__sklearn_tags__().estimator_type == 'regressor' + assert is_regressor(model) + +def test_GLM_binomial_is_classifier(): + model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial()}) + assert model.__sklearn_tags__().estimator_type == 'classifier' + assert is_classifier(model) + +def test_GLM_binomial_probit_is_classifier(): + model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial(link=sm.families.links.Probit())}) + assert model.__sklearn_tags__().estimator_type == 'classifier' + assert is_classifier(model) + + +def test_selected_OLS_is_regressor(model_setup): + X, y, model_spec_dummy, min_max_strategy_dummy = model_setup + model = sklearn_selected(sm.OLS, strategy=min_max_strategy_dummy) + assert model.__sklearn_tags__().estimator_type == 'regressor' + assert is_regressor(model) + +def test_selected_GLM_binomial_is_classifier(model_setup): + X, y, model_spec_dummy, min_max_strategy_dummy = model_setup + model = sklearn_selected(sm.GLM, strategy=min_max_strategy_dummy, model_args={'family': sm.families.Binomial()}) + assert model.__sklearn_tags__().estimator_type == 'classifier' + assert is_classifier(model) diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 0000000..39f1447 --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,30 @@ +# test that all datasets import + +from ISLP import load_data +import numpy as np +import pytest + +datasets = ['Auto', + 'Bikeshare', + 'Boston', + 'BrainCancer', + 'Caravan', + 'Carseats', + 'College', + 'Credit', + 'Default', + 'Fund', + 'Hitters', + 'NYSE', + 'OJ', + 'Portfolio', + 'Publication', + 'Smarket', + 'Wage', + 
'Weekly'] + +@pytest.mark.parametrize('dataset', datasets) +def test_load(dataset): + df = load_data(dataset) + for col in df.columns: + assert df[col].dtype != np.dtype(object) diff --git a/torch_requirements.txt b/torch_requirements.txt deleted file mode 100644 index f3b355a..0000000 --- a/torch_requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -torch -torchvision -torchmetrics -torchdata -pytorch_lightning -torchinfo diff --git a/versioneer.py b/versioneer.py deleted file mode 100644 index b4cd1d6..0000000 --- a/versioneer.py +++ /dev/null @@ -1,2109 +0,0 @@ - -# Version: 0.21 - -"""The Versioneer - like a rocketeer, but for versions. - -The Versioneer -============== - -* like a rocketeer, but for versions! -* https://github.com/python-versioneer/python-versioneer -* Brian Warner -* License: Public Domain -* Compatible with: Python 3.6, 3.7, 3.8, 3.9 and pypy3 -* [![Latest Version][pypi-image]][pypi-url] -* [![Build Status][travis-image]][travis-url] - -This is a tool for managing a recorded version number in distutils-based -python projects. The goal is to remove the tedious and error-prone "update -the embedded version string" step from your release process. Making a new -release should be as easy as recording a new tag in your version-control -system, and maybe making new tarballs. 
- - -## Quick Install - -* `pip install versioneer` to somewhere in your $PATH -* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) -* run `versioneer install` in your source tree, commit the results -* Verify version information with `python setup.py version` - -## Version Identifiers - -Source trees come from a variety of places: - -* a version-control system checkout (mostly used by developers) -* a nightly tarball, produced by build automation -* a snapshot tarball, produced by a web-based VCS browser, like github's - "tarball from tag" feature -* a release tarball, produced by "setup.py sdist", distributed through PyPI - -Within each source tree, the version identifier (either a string or a number, -this tool is format-agnostic) can come from a variety of places: - -* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows - about recent "tags" and an absolute revision-id -* the name of the directory into which the tarball was unpacked -* an expanded VCS keyword ($Id$, etc) -* a `_version.py` created by some earlier build step - -For released software, the version identifier is closely related to a VCS -tag. Some projects use tag names that include more than just the version -string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool -needs to strip the tag prefix to extract the version identifier. For -unreleased software (between tags), the version identifier should provide -enough information to help developers recreate the same tree, while also -giving them an idea of roughly how old the tree is (after version 1.2, before -version 1.3). Many VCS systems can report a description that captures this, -for example `git describe --tags --dirty --always` reports things like -"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the -0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has -uncommitted changes). 
- -The version identifier is used for multiple purposes: - -* to allow the module to self-identify its version: `myproject.__version__` -* to choose a name and prefix for a 'setup.py sdist' tarball - -## Theory of Operation - -Versioneer works by adding a special `_version.py` file into your source -tree, where your `__init__.py` can import it. This `_version.py` knows how to -dynamically ask the VCS tool for version information at import time. - -`_version.py` also contains `$Revision$` markers, and the installation -process marks `_version.py` to have this marker rewritten with a tag name -during the `git archive` command. As a result, generated tarballs will -contain enough information to get the proper version. - -To allow `setup.py` to compute a version too, a `versioneer.py` is added to -the top level of your source tree, next to `setup.py` and the `setup.cfg` -that configures it. This overrides several distutils/setuptools commands to -compute the version when invoked, and changes `setup.py build` and `setup.py -sdist` to replace `_version.py` with a small static file that contains just -the generated version data. - -## Installation - -See [INSTALL.md](./INSTALL.md) for detailed installation instructions. - -## Version-String Flavors - -Code which uses Versioneer can learn about its version string at runtime by -importing `_version` from your main `__init__.py` file and running the -`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can -import the top-level `versioneer.py` and run `get_versions()`. - -Both functions return a dictionary with different flavors of version -information: - -* `['version']`: A condensed version string, rendered using the selected - style. This is the most commonly used value for the project's version - string. The default "pep440" style yields strings like `0.11`, - `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section - below for alternative styles. 
- -* `['full-revisionid']`: detailed revision identifier. For Git, this is the - full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". - -* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the - commit date in ISO 8601 format. This will be None if the date is not - available. - -* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that - this is only accurate if run in a VCS checkout, otherwise it is likely to - be False or None - -* `['error']`: if the version string could not be computed, this will be set - to a string describing the problem, otherwise it will be None. It may be - useful to throw an exception in setup.py if this is set, to avoid e.g. - creating tarballs with a version string of "unknown". - -Some variants are more useful than others. Including `full-revisionid` in a -bug report should allow developers to reconstruct the exact code being tested -(or indicate the presence of local changes that should be shared with the -developers). `version` is suitable for display in an "about" box or a CLI -`--version` output: it can be easily compared against release notes and lists -of bugs fixed in various releases. - -The installer adds the following text to your `__init__.py` to place a basic -version in `YOURPROJECT.__version__`: - - from ._version import get_versions - __version__ = get_versions()['version'] - del get_versions - -## Styles - -The setup.cfg `style=` configuration controls how the VCS information is -rendered into a version string. - -The default style, "pep440", produces a PEP440-compliant string, equal to the -un-prefixed tag name for actual releases, and containing an additional "local -version" section with more detail for in-between builds. For Git, this is -TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags ---dirty --always`. 
For example "0.11+2.g1076c97.dirty" indicates that the -tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and -that this commit is two revisions ("+2") beyond the "0.11" tag. For released -software (exactly equal to a known tag), the identifier will only contain the -stripped tag, e.g. "0.11". - -Other styles are available. See [details.md](details.md) in the Versioneer -source tree for descriptions. - -## Debugging - -Versioneer tries to avoid fatal errors: if something goes wrong, it will tend -to return a version of "0+unknown". To investigate the problem, run `setup.py -version`, which will run the version-lookup code in a verbose mode, and will -display the full contents of `get_versions()` (including the `error` string, -which may help identify what went wrong). - -## Known Limitations - -Some situations are known to cause problems for Versioneer. This details the -most significant ones. More can be found on Github -[issues page](https://github.com/python-versioneer/python-versioneer/issues). - -### Subprojects - -Versioneer has limited support for source trees in which `setup.py` is not in -the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are -two common reasons why `setup.py` might not be in the root: - -* Source trees which contain multiple subprojects, such as - [Buildbot](https://github.com/buildbot/buildbot), which contains both - "master" and "slave" subprojects, each with their own `setup.py`, - `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI - distributions (and upload multiple independently-installable tarballs). -* Source trees whose main purpose is to contain a C library, but which also - provide bindings to Python (and perhaps other languages) in subdirectories. - -Versioneer will look for `.git` in parent directories, and most operations -should get the right version string. 
However `pip` and `setuptools` have bugs -and implementation details which frequently cause `pip install .` from a -subproject directory to fail to find a correct version string (so it usually -defaults to `0+unknown`). - -`pip install --editable .` should work correctly. `setup.py install` might -work too. - -Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in -some later version. - -[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking -this issue. The discussion in -[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the -issue from the Versioneer side in more detail. -[pip PR#3176](https://github.com/pypa/pip/pull/3176) and -[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve -pip to let Versioneer work correctly. - -Versioneer-0.16 and earlier only looked for a `.git` directory next to the -`setup.cfg`, so subprojects were completely unsupported with those releases. - -### Editable installs with setuptools <= 18.5 - -`setup.py develop` and `pip install --editable .` allow you to install a -project into a virtualenv once, then continue editing the source code (and -test) without re-installing after every change. - -"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a -convenient way to specify executable scripts that should be installed along -with the python package. - -These both work as expected when using modern setuptools. When using -setuptools-18.5 or earlier, however, certain operations will cause -`pkg_resources.DistributionNotFound` errors when running the entrypoint -script, which must be resolved by re-installing the package. This happens -when the install happens with one version, then the egg_info data is -regenerated while a different version is checked out. Many setup.py commands -cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into -a different virtualenv), so this can be surprising. 
- -[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes -this one, but upgrading to a newer version of setuptools should probably -resolve it. - - -## Updating Versioneer - -To upgrade your project to a new release of Versioneer, do the following: - -* install the new Versioneer (`pip install -U versioneer` or equivalent) -* edit `setup.cfg`, if necessary, to include any new configuration settings - indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. -* re-run `versioneer install` in your source tree, to replace - `SRC/_version.py` -* commit any changed files - -## Future Directions - -This tool is designed to make it easily extended to other version-control -systems: all VCS-specific components are in separate directories like -src/git/ . The top-level `versioneer.py` script is assembled from these -components by running make-versioneer.py . In the future, make-versioneer.py -will take a VCS name as an argument, and will construct a version of -`versioneer.py` that is specific to the given VCS. It might also take the -configuration arguments that are currently provided manually during -installation by editing setup.py . Alternatively, it might go the other -direction and include code from all supported VCS systems, reducing the -number of intermediate scripts. - -## Similar projects - -* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time - dependency -* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of - versioneer -* [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools - plugin - -## License - -To make Versioneer easier to embed, all its code is dedicated to the public -domain. The `_version.py` that it creates is also in the public domain. 
-Specifically, both are released under the Creative Commons "Public Domain -Dedication" license (CC0-1.0), as described in -https://creativecommons.org/publicdomain/zero/1.0/ . - -[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg -[pypi-url]: https://pypi.python.org/pypi/versioneer/ -[travis-image]: -https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg -[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer - -""" -# pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring -# pylint:disable=missing-class-docstring,too-many-branches,too-many-statements -# pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error -# pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with -# pylint:disable=attribute-defined-outside-init,too-many-arguments - -import configparser -import errno -import json -import os -import re -import subprocess -import sys -from typing import Callable, Dict - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_root(): - """Get the project root directory. - - We require that all commands are run from the project root, i.e. the - directory that contains setup.py, setup.cfg, and versioneer.py . - """ - root = os.path.realpath(os.path.abspath(os.getcwd())) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - # allow 'python path/to/setup.py COMMAND' - root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. 
" - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") - raise VersioneerBadRootError(err) - try: - # Certain runtime workflows (setup.py install/develop in a setuptools - # tree) execute all dependencies in a single python process, so - # "versioneer" may be imported multiple times, and python's shared - # module-import table will cache the first one. So we can't use - # os.path.dirname(__file__), as that will find whichever - # versioneer.py was first imported, even in later projects. - my_path = os.path.realpath(os.path.abspath(__file__)) - me_dir = os.path.normcase(os.path.splitext(my_path)[0]) - vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) - if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(my_path), versioneer_py)) - except NameError: - pass - return root - - -def get_config_from_root(root): - """Read the project setup.cfg file to determine Versioneer config.""" - # This might raise OSError (if setup.cfg is missing), or - # configparser.NoSectionError (if it lacks a [versioneer] section), or - # configparser.NoOptionError (if it lacks "VCS="). See the docstring at - # the top of versioneer.py for instructions on writing your setup.cfg . 
- setup_cfg = os.path.join(root, "setup.cfg") - parser = configparser.ConfigParser() - with open(setup_cfg, "r") as cfg_file: - parser.read_file(cfg_file) - VCS = parser.get("versioneer", "VCS") # mandatory - - # Dict-like interface for non-mandatory entries - section = parser["versioneer"] - - cfg = VersioneerConfig() - cfg.VCS = VCS - cfg.style = section.get("style", "") - cfg.versionfile_source = section.get("versionfile_source") - cfg.versionfile_build = section.get("versionfile_build") - cfg.tag_prefix = section.get("tag_prefix") - if cfg.tag_prefix in ("''", '""'): - cfg.tag_prefix = "" - cfg.parentdir_prefix = section.get("parentdir_prefix") - cfg.verbose = section.get("verbose") - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -# these dictionaries contain VCS-specific tools -LONG_VERSION_PY: Dict[str, str] = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} - - -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - HANDLERS.setdefault(vcs, {})[method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - process = None - for command in commands: - try: - dispcmd = str([command] + args) - # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen([command] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except OSError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = 
process.communicate()[0].strip().decode() - if process.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, process.returncode - return stdout, process.returncode - - -LONG_VERSION_PY['git'] = r''' -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.21 (https://github.com/python-versioneer/python-versioneer) - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys -from typing import Callable, Dict - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" - git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" - git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "%(STYLE)s" - cfg.tag_prefix = "%(TAG_PREFIX)s" - cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" - cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY: Dict[str, str] = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} - - -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - process = None - for command in commands: - try: - dispcmd = str([command] + args) - # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen([command] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except OSError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %%s" %% dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %%s" %% (commands,)) - return None, None - stdout = 
process.communicate()[0].strip().decode() - if process.returncode != 0: - if verbose: - print("unable to run %%s (error)" %% dispcmd) - print("stdout was %%s" %% stdout) - return None, process.returncode - return stdout, process.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for _ in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print("Tried directories %%s but none started with prefix %%s" %% - (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - with open(versionfile_abs, "r") as fobj: - for line in fobj: - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - except OSError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if "refnames" not in keywords: - raise NotThisMethod("Short version file found") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - - # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = {r.strip() for r in refnames.strip("()").split(",")} - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. 
The old git %%d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} - if verbose: - print("discarding '%%s', no digits" %% ",".join(refs - tags)) - if verbose: - print("likely tags: %%s" %% ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - # Filter out refs that exactly match prefix or that don't start - # with a number once the prefix is stripped (mostly a concern - # when prefix is '') - if not re.match(r'\d', r): - continue - if verbose: - print("picking %%s" %% r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. 
- """ - GITS = ["git"] - TAG_PREFIX_REGEX = "*" - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - TAG_PREFIX_REGEX = r"\*" - - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %%s not under git control" %% root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", - "%%s%%s" %% (tag_prefix, TAG_PREFIX_REGEX)], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], - cwd=root) - # --abbrev-ref was added in git-1.6.3 - if rc != 0 or branch_name is None: - raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") - branch_name = branch_name.strip() - - if branch_name == "HEAD": - # If we aren't exactly on a branch, pick a branch which represents - # the current commit. If all else fails, we are on a branchless - # commit. - branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) - # --contains was added in git-1.5.4 - if rc != 0 or branches is None: - raise NotThisMethod("'git branch --contains' returned error") - branches = branches.split("\n") - - # Remove the first line if we're running detached - if "(" in branches[0]: - branches.pop(0) - - # Strip off the leading "* " from the list of branches. 
- branches = [branch[2:] for branch in branches] - if "master" in branches: - branch_name = "master" - elif not branches: - branch_name = None - else: - # Pick the first branch that is returned. Good or bad. - branch_name = branches[0] - - pieces["branch"] = branch_name - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparsable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%%s'" - %% describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%%s' doesn't start with prefix '%%s'" - print(fmt %% (full_tag, tag_prefix)) - pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" - %% (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. 
- date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_branch(pieces): - """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . - - The ".dev0" means not master branch. Note that .dev0 sorts backwards - (a feature branch will appear "older" than the master branch). - - Exceptions: - 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0" - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def pep440_split_post(ver): - """Split pep440 version string at the post-release segment. 
- - Returns the release segments before the post-release and the - post-release version number (or -1 if no post-release segment is present). - """ - vc = str.split(ver, ".post") - return vc[0], int(vc[1] or 0) if len(vc) == 2 else None - - -def render_pep440_pre(pieces): - """TAG[.postN.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post0.devDISTANCE - """ - if pieces["closest-tag"]: - if pieces["distance"]: - # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) - rendered = tag_version - if post_version is not None: - rendered += ".post%%d.dev%%d" %% (post_version+1, pieces["distance"]) - else: - rendered += ".post0.dev%%d" %% (pieces["distance"]) - else: - # no commits, use the tag as the version - rendered = pieces["closest-tag"] - else: - # exception #1 - rendered = "0.post0.dev%%d" %% pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%%s" %% pieces["short"] - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%%s" %% pieces["short"] - return rendered - - -def render_pep440_post_branch(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . - - The ".dev0" means not master branch. - - Exceptions: - 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%%s" %% pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+g%%s" %% pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-branch": - rendered = render_pep440_branch(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-post-branch": - rendered = render_pep440_post_branch(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%%s'" %% style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. 
- - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. - for _ in cfg.versionfile_source.split('/'): - root = os.path.dirname(root) - except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} -''' - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - with open(versionfile_abs, "r") as fobj: - for line in fobj: - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - except OSError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if "refnames" not in keywords: - raise NotThisMethod("Short version file found") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = {r.strip() for r in refnames.strip("()").split(",")} - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. 
The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - # Filter out refs that exactly match prefix or that don't start - # with a number once the prefix is stripped (mostly a concern - # when prefix is '') - if not re.match(r'\d', r): - continue - if verbose: - print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. 
- """ - GITS = ["git"] - TAG_PREFIX_REGEX = "*" - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - TAG_PREFIX_REGEX = r"\*" - - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", - "%s%s" % (tag_prefix, TAG_PREFIX_REGEX)], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], - cwd=root) - # --abbrev-ref was added in git-1.6.3 - if rc != 0 or branch_name is None: - raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") - branch_name = branch_name.strip() - - if branch_name == "HEAD": - # If we aren't exactly on a branch, pick a branch which represents - # the current commit. If all else fails, we are on a branchless - # commit. - branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) - # --contains was added in git-1.5.4 - if rc != 0 or branches is None: - raise NotThisMethod("'git branch --contains' returned error") - branches = branches.split("\n") - - # Remove the first line if we're running detached - if "(" in branches[0]: - branches.pop(0) - - # Strip off the leading "* " from the list of branches. 
- branches = [branch[2:] for branch in branches] - if "master" in branches: - branch_name = "master" - elif not branches: - branch_name = None - else: - # Pick the first branch that is returned. Good or bad. - branch_name = branches[0] - - pieces["branch"] = branch_name - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparsable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. 
- date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def do_vcs_install(manifest_in, versionfile_source, ipy): - """Git-specific installation logic for Versioneer. - - For Git, this means creating/changing .gitattributes to mark _version.py - for export-subst keyword substitution. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - files = [manifest_in, versionfile_source] - if ipy: - files.append(ipy) - try: - my_path = __file__ - if my_path.endswith(".pyc") or my_path.endswith(".pyo"): - my_path = os.path.splitext(my_path)[0] + ".py" - versioneer_file = os.path.relpath(my_path) - except NameError: - versioneer_file = "versioneer.py" - files.append(versioneer_file) - present = False - try: - with open(".gitattributes", "r") as fobj: - for line in fobj: - if line.strip().startswith(versionfile_source): - if "export-subst" in line.strip().split()[1:]: - present = True - break - except OSError: - pass - if not present: - with open(".gitattributes", "a+") as fobj: - fobj.write(f"{versionfile_source} export-subst\n") - files.append(".gitattributes") - run_command(GITS, ["add", "--"] + files) - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. 
We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for _ in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -SHORT_VERSION_PY = """ -# This file was generated by 'versioneer.py' (0.21) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. Distribution tarballs contain a pre-generated copy -# of this file. - -import json - -version_json = ''' -%s -''' # END VERSION_JSON - - -def get_versions(): - return json.loads(version_json) -""" - - -def versions_from_file(filename): - """Try to determine the version from _version.py if present.""" - try: - with open(filename) as f: - contents = f.read() - except OSError: - raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) - if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) - if not mo: - raise NotThisMethod("no version_json in _version.py") - return json.loads(mo.group(1)) - - -def write_to_version_file(filename, versions): - """Write the given version number to the given _version.py file.""" - os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) - with open(filename, "w") as f: - f.write(SHORT_VERSION_PY % contents) - - print("set %s to '%s'" % (filename, versions["version"])) - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in 
pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_branch(pieces): - """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . - - The ".dev0" means not master branch. Note that .dev0 sorts backwards - (a feature branch will appear "older" than the master branch). - - Exceptions: - 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0" - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def pep440_split_post(ver): - """Split pep440 version string at the post-release segment. - - Returns the release segments before the post-release and the - post-release version number (or -1 if no post-release segment is present). 
- """ - vc = str.split(ver, ".post") - return vc[0], int(vc[1] or 0) if len(vc) == 2 else None - - -def render_pep440_pre(pieces): - """TAG[.postN.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post0.devDISTANCE - """ - if pieces["closest-tag"]: - if pieces["distance"]: - # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) - rendered = tag_version - if post_version is not None: - rendered += ".post%d.dev%d" % (post_version+1, pieces["distance"]) - else: - rendered += ".post0.dev%d" % (pieces["distance"]) - else: - # no commits, use the tag as the version - rendered = pieces["closest-tag"] - else: - # exception #1 - rendered = "0.post0.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_post_branch(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . - - The ".dev0" means not master branch. - - Exceptions: - 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-branch": - rendered = render_pep440_branch(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-post-branch": - rendered = render_pep440_post_branch(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} - - -class VersioneerBadRootError(Exception): - """The project root directory is unknown or missing key files.""" - - -def get_versions(verbose=False): - """Get the project version from whatever source is available. - - Returns dict with two keys: 'version' and 'full'. 
- """ - if "versioneer" in sys.modules: - # see the discussion in cmdclass.py:get_cmdclass() - del sys.modules["versioneer"] - - root = get_root() - cfg = get_config_from_root(root) - - assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" - handlers = HANDLERS.get(cfg.VCS) - assert handlers, "unrecognized VCS '%s'" % cfg.VCS - verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" - assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" - - versionfile_abs = os.path.join(root, cfg.versionfile_source) - - # extract version from first of: _version.py, VCS command (e.g. 'git - # describe'), parentdir. This is meant to work for developers using a - # source checkout, for users of a tarball created by 'setup.py sdist', - # and for users of a tarball/zipball created by 'git archive' or github's - # download-from-tag feature or the equivalent in other VCSes. - - get_keywords_f = handlers.get("get_keywords") - from_keywords_f = handlers.get("keywords") - if get_keywords_f and from_keywords_f: - try: - keywords = get_keywords_f(versionfile_abs) - ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) - if verbose: - print("got version from expanded keyword %s" % ver) - return ver - except NotThisMethod: - pass - - try: - ver = versions_from_file(versionfile_abs) - if verbose: - print("got version from file %s %s" % (versionfile_abs, ver)) - return ver - except NotThisMethod: - pass - - from_vcs_f = handlers.get("pieces_from_vcs") - if from_vcs_f: - try: - pieces = from_vcs_f(cfg.tag_prefix, root, verbose) - ver = render(pieces, cfg.style) - if verbose: - print("got version from VCS %s" % ver) - return ver - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - if verbose: - print("got version from parentdir %s" % ver) - return ver - except NotThisMethod: - pass - - if verbose: - 
print("unable to compute version") - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} - - -def get_version(): - """Get the short version string for this project.""" - return get_versions()["version"] - - -def get_cmdclass(cmdclass=None): - """Get the custom setuptools/distutils subclasses used by Versioneer. - - If the package uses a different cmdclass (e.g. one from numpy), it - should be provide as an argument. - """ - if "versioneer" in sys.modules: - del sys.modules["versioneer"] - # this fixes the "python setup.py develop" case (also 'install' and - # 'easy_install .'), in which subdependencies of the main project are - # built (using setup.py bdist_egg) in the same python process. Assume - # a main project A and a dependency B, which use different versions - # of Versioneer. A's setup.py imports A's Versioneer, leaving it in - # sys.modules by the time B's setup.py is executed, causing B to run - # with the wrong versioneer. Setuptools wraps the sub-dep builds in a - # sandbox that restores sys.modules to it's pre-build state, so the - # parent is protected against the child's "import versioneer". By - # removing ourselves from sys.modules here, before the child build - # happens, we protect the child from the parent's versioneer too. 
- # Also see https://github.com/python-versioneer/python-versioneer/issues/52 - - cmds = {} if cmdclass is None else cmdclass.copy() - - # we add "version" to both distutils and setuptools - from distutils.core import Command - - class cmd_version(Command): - description = "report generated version string" - user_options = [] - boolean_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - vers = get_versions(verbose=True) - print("Version: %s" % vers["version"]) - print(" full-revisionid: %s" % vers.get("full-revisionid")) - print(" dirty: %s" % vers.get("dirty")) - print(" date: %s" % vers.get("date")) - if vers["error"]: - print(" error: %s" % vers["error"]) - cmds["version"] = cmd_version - - # we override "build_py" in both distutils and setuptools - # - # most invocation pathways end up running build_py: - # distutils/build -> build_py - # distutils/install -> distutils/build ->.. - # setuptools/bdist_wheel -> distutils/install ->.. - # setuptools/bdist_egg -> distutils/install_lib -> build_py - # setuptools/install -> bdist_egg ->.. - # setuptools/develop -> ? - # pip install: - # copies source tree to a tempdir before running egg_info/etc - # if .git isn't copied too, 'git describe' will fail - # then does setup.py bdist_wheel, or sometimes setup.py install - # setup.py egg_info -> ? 
- - # we override different "build_py" commands for both environments - if 'build_py' in cmds: - _build_py = cmds['build_py'] - elif "setuptools" in sys.modules: - from setuptools.command.build_py import build_py as _build_py - else: - from distutils.command.build_py import build_py as _build_py - - class cmd_build_py(_build_py): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - _build_py.run(self) - # now locate _version.py in the new build/ directory and replace - # it with an updated value - if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - cmds["build_py"] = cmd_build_py - - if 'build_ext' in cmds: - _build_ext = cmds['build_ext'] - elif "setuptools" in sys.modules: - from setuptools.command.build_ext import build_ext as _build_ext - else: - from distutils.command.build_ext import build_ext as _build_ext - - class cmd_build_ext(_build_ext): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - _build_ext.run(self) - if self.inplace: - # build_ext --inplace will only build extensions in - # build/lib<..> dir with no _version.py to write to. - # As in place builds will already have a _version.py - # in the module dir, we do not need to write one. - return - # now locate _version.py in the new build/ directory and replace - # it with an updated value - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - cmds["build_ext"] = cmd_build_ext - - if "cx_Freeze" in sys.modules: # cx_freeze enabled? - from cx_Freeze.dist import build_exe as _build_exe - # nczeczulin reports that py2exe won't like the pep440-style string - # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. 
- # setup(console=[{ - # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION - # "product_version": versioneer.get_version(), - # ... - - class cmd_build_exe(_build_exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _build_exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - cmds["build_exe"] = cmd_build_exe - del cmds["build_py"] - - if 'py2exe' in sys.modules: # py2exe enabled? - from py2exe.distutils_buildexe import py2exe as _py2exe - - class cmd_py2exe(_py2exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _py2exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - cmds["py2exe"] = cmd_py2exe - - # we override different "sdist" commands for both environments - if 'sdist' in cmds: - _sdist = cmds['sdist'] - elif "setuptools" in sys.modules: - from setuptools.command.sdist import sdist as _sdist - else: - from distutils.command.sdist import sdist as _sdist - - class cmd_sdist(_sdist): - def run(self): - versions = get_versions() - self._versioneer_generated_versions = versions - # unless we update this, the command will keep using the 
old - # version - self.distribution.metadata.version = versions["version"] - return _sdist.run(self) - - def make_release_tree(self, base_dir, files): - root = get_root() - cfg = get_config_from_root(root) - _sdist.make_release_tree(self, base_dir, files) - # now locate _version.py in the new base_dir directory - # (remembering that it may be a hardlink) and replace it with an - # updated value - target_versionfile = os.path.join(base_dir, cfg.versionfile_source) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) - cmds["sdist"] = cmd_sdist - - return cmds - - -CONFIG_ERROR = """ -setup.cfg is missing the necessary Versioneer configuration. You need -a section like: - - [versioneer] - VCS = git - style = pep440 - versionfile_source = src/myproject/_version.py - versionfile_build = myproject/_version.py - tag_prefix = - parentdir_prefix = myproject- - -You will also need to edit your setup.py to use the results: - - import versioneer - setup(version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), ...) - -Please read the docstring in ./versioneer.py for configuration instructions, -edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. -""" - -SAMPLE_CONFIG = """ -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. - -[versioneer] -#VCS = git -#style = pep440 -#versionfile_source = -#versionfile_build = -#tag_prefix = -#parentdir_prefix = - -""" - -OLD_SNIPPET = """ -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions -""" - -INIT_PY_SNIPPET = """ -from . 
import {0} -__version__ = {0}.get_versions()['version'] -""" - - -def do_setup(): - """Do main VCS-independent setup function for installing Versioneer.""" - root = get_root() - try: - cfg = get_config_from_root(root) - except (OSError, configparser.NoSectionError, - configparser.NoOptionError) as e: - if isinstance(e, (OSError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) - with open(os.path.join(root, "setup.cfg"), "a") as f: - f.write(SAMPLE_CONFIG) - print(CONFIG_ERROR, file=sys.stderr) - return 1 - - print(" creating %s" % cfg.versionfile_source) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") - if os.path.exists(ipy): - try: - with open(ipy, "r") as f: - old = f.read() - except OSError: - old = "" - module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] - snippet = INIT_PY_SNIPPET.format(module) - if OLD_SNIPPET in old: - print(" replacing boilerplate in %s" % ipy) - with open(ipy, "w") as f: - f.write(old.replace(OLD_SNIPPET, snippet)) - elif snippet not in old: - print(" appending to %s" % ipy) - with open(ipy, "a") as f: - f.write(snippet) - else: - print(" %s unmodified" % ipy) - else: - print(" %s doesn't exist, ok" % ipy) - ipy = None - - # Make sure both the top-level "versioneer.py" and versionfile_source - # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so - # they'll be copied into source distributions. Pip won't be able to - # install the package without this. 
- manifest_in = os.path.join(root, "MANIFEST.in") - simple_includes = set() - try: - with open(manifest_in, "r") as f: - for line in f: - if line.startswith("include "): - for include in line.split()[1:]: - simple_includes.add(include) - except OSError: - pass - # That doesn't cover everything MANIFEST.in can do - # (http://docs.python.org/2/distutils/sourcedist.html#commands), so - # it might give some false negatives. Appending redundant 'include' - # lines is safe, though. - if "versioneer.py" not in simple_includes: - print(" appending 'versioneer.py' to MANIFEST.in") - with open(manifest_in, "a") as f: - f.write("include versioneer.py\n") - else: - print(" 'versioneer.py' already in MANIFEST.in") - if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) - with open(manifest_in, "a") as f: - f.write("include %s\n" % cfg.versionfile_source) - else: - print(" versionfile_source already in MANIFEST.in") - - # Make VCS-specific changes. For git, this means creating/changing - # .gitattributes to mark _version.py for export-subst keyword - # substitution. - do_vcs_install(manifest_in, cfg.versionfile_source, ipy) - return 0 - - -def scan_setup_py(): - """Validate the contents of setup.py against Versioneer's expectations.""" - found = set() - setters = False - errors = 0 - with open("setup.py", "r") as f: - for line in f.readlines(): - if "import versioneer" in line: - found.add("import") - if "versioneer.get_cmdclass()" in line: - found.add("cmdclass") - if "versioneer.get_version()" in line: - found.add("get_version") - if "versioneer.VCS" in line: - setters = True - if "versioneer.versionfile_source" in line: - setters = True - if len(found) != 3: - print("") - print("Your setup.py appears to be missing some important items") - print("(but I might be wrong). 
Please make sure it has something") - print("roughly like the following:") - print("") - print(" import versioneer") - print(" setup( version=versioneer.get_version(),") - print(" cmdclass=versioneer.get_cmdclass(), ...)") - print("") - errors += 1 - if setters: - print("You should remove lines like 'versioneer.VCS = ' and") - print("'versioneer.versionfile_source = ' . This configuration") - print("now lives in setup.cfg, and should be removed from setup.py") - print("") - errors += 1 - return errors - - -if __name__ == "__main__": - cmd = sys.argv[1] - if cmd == "setup": - errors = do_setup() - errors += scan_setup_py() - if errors: - sys.exit(1)