Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 101 additions & 41 deletions analysis.ipynb

Large diffs are not rendered by default.

996 changes: 996 additions & 0 deletions models.ipynb

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ appnope==0.1.0
attrs==19.1.0
backcall==0.1.0
bleach==3.1.0
cycler==0.10.0
decorator==4.4.0
defusedxml==0.5.0
entrypoints==0.3
Expand All @@ -15,11 +16,15 @@ jupyter-client==5.2.4
jupyter-core==4.4.0
jupyterlab==0.35.4
jupyterlab-server==0.2.0
kiwisolver==1.0.1
MarkupSafe==1.1.1
matplotlib==3.0.3
mistune==0.8.4
nbconvert==5.4.1
nbformat==4.4.0
notebook==5.7.8
numpy==1.16.2
pandas==0.24.2
pandocfilters==1.4.2
parso==0.4.0
pexpect==4.7.0
Expand All @@ -28,9 +33,13 @@ prometheus-client==0.6.0
prompt-toolkit==2.0.9
ptyprocess==0.6.0
Pygments==2.3.1
pyparsing==2.4.0
pyrsistent==0.14.11
python-dateutil==2.8.0
pytz==2018.9
pyzmq==18.0.1
scikit-learn==0.20.3
scipy==1.2.1
Send2Trash==1.5.0
six==1.12.0
terminado==0.8.2
Expand Down
10 changes: 5 additions & 5 deletions utils/ContinuityImputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ def fit(self, X, y=None):
X_iq = X[X['city'] == 'iq']
X_sj = X[X['city'] == 'sj']

self.medians_iq = {attr: np.nanmedian(X_iq[attr]) for attr in self.attributes}
self.medians_sj = {attr: np.nanmedian(X_sj[attr]) for attr in self.attributes}
medians_iq = {attr: np.nanmedian(X_iq[attr]) for attr in self.attributes}
medians_sj = {attr: np.nanmedian(X_sj[attr]) for attr in self.attributes}
self.last_values = {'sj': medians_sj, 'iq': medians_iq}

return self

Expand All @@ -20,14 +21,13 @@ def transform(self, X):
X = X.copy()

for attr in self.attributes:
last_values = {'sj': self.medians_sj[attr], 'iq': self.medians_iq[attr]}
r = []
for _, curr in X.iterrows():
city = curr['city']
val = curr[attr]
if val is not None and not np.isnan(val):
last_values[city] = val
r.append(last_values[city])
self.last_values[city][attr] = val
r.append(self.last_values[city][attr])
X[attr] = r

return X
19 changes: 19 additions & 0 deletions utils/DataFrameDropper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from sklearn.base import BaseEstimator, TransformerMixin
import pandas

class DataFrameDropper(BaseEstimator, TransformerMixin):
    """Pipeline transformer that removes the given columns from a DataFrame.

    Parameters
    ----------
    attribute_names : column label or list of labels passed to ``DataFrame.drop``.
    copy : if True (default), operate on a copy so the caller's frame is untouched.
    """

    def __init__(self, attribute_names, copy=True):
        self.attribute_names = attribute_names
        self.copy = copy

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        Raises
        ------
        ValueError
            If ``X`` is not a pandas DataFrame.
        """
        # Validate first so we never pay for a copy of unusable input.
        # pandas.DataFrame is the public alias of pandas.core.frame.DataFrame.
        if not isinstance(X, pandas.DataFrame):
            raise ValueError(
                'DataFrameDropper can only drop columns from a pandas '
                'DataFrame, got ' + type(X).__name__)
        if self.copy:
            X = X.copy()
        return X.drop(self.attribute_names, axis=1)
11 changes: 11 additions & 0 deletions utils/DataFrameSelector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that projects the input onto a fixed set of columns.

    Parameters
    ----------
    attribute_names : the column label(s) to keep; anything accepted by
        ``X[...]`` indexing.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Selection is pure configuration; nothing is learned here.
        return self

    def transform(self, X):
        # Plain __getitem__ keeps this usable for any mapping-style input.
        selected = X[self.attribute_names]
        return selected
53 changes: 53 additions & 0 deletions utils/LastInfected.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from sklearn.base import BaseEstimator, TransformerMixin
from collections import deque
import numpy as np
import pandas as pd
from random import gauss, choice

class LastInfected(BaseEstimator, TransformerMixin):
    """Adds the previous ``weeks`` infection counts as lag feature columns.

    One rolling window (a deque of length ``weeks``) is kept per city key
    ('sj' and 'iq').  The first ``transform`` after ``fit`` pushes the true
    labels into the windows; afterwards the caller is expected to push its
    own rounded predictions through ``append_y`` (see
    utils/predict_in_order.py, which does exactly that).
    """

    def __init__(self, weeks=1, new_attributes_prefix='last_infected_', add_noise=False, noise_mean=None, noise_std=None, copy=True):
        self.weeks=weeks
        self.new_attributes_prefix = new_attributes_prefix
        self.copy=copy
        # One zero-filled window per city.  NOTE(review): the windows live in
        # __init__, so re-fitting the same instance carries the previous
        # run's window contents over — confirm that is intended.
        dq = deque([0 for _ in range(weeks)])
        self.last = {'sj': dq.copy(), 'iq': dq.copy()}
        self.add_noise = add_noise
        self.noise_mean = noise_mean
        self.noise_std = noise_std


    def fit(self, X, y):
        # ``first`` marks the next transform as the training pass, during
        # which the stored labels are fed into the rolling windows.
        self.first = True
        self.y = y.to_list()
        return self

    def transform(self, X, model=None):
        """Return ``X`` extended with ``weeks`` lag columns per row.

        ``model`` is accepted but never used in this body.  Assumes rows of
        ``X`` arrive in chronological order per city and that, on the
        training pass, row i of ``X`` lines up with ``self.y[i]`` — TODO
        confirm with the callers.
        """
        if self.copy:
            X = X.copy()

        X.reset_index(drop=True, inplace=True)

        r = np.ndarray(shape=[X.shape[0], self.weeks])

        for idx, x in X.iterrows():
            # Stored on self so append_y knows which city's window to update.
            self.city = x['city']
            r[idx] = self.last[self.city]
            if self.first:
                # Training pass: push the true label into the window.
                self.append_y(self.y[idx])

        r = pd.DataFrame(r, columns=[self.new_attributes_prefix + str(week) for week in range(self.weeks)])

        X = pd.concat([X, r], axis=1)

        self.first=False

        return X

    def append_y(self, new_y):
        """Push ``new_y`` into the current city's rolling window (FIFO)."""
        if self.add_noise:
            # Symmetric noise: random sign times a Gaussian magnitude.
            noise = int(np.round(choice([-1,1]) * gauss(mu=self.noise_mean, sigma=self.noise_std)))
            new_y += noise
            #if new_y < 0:
                #new_y = 0
        self.last[self.city].appendleft(new_y)
        self.last[self.city].pop()
43 changes: 43 additions & 0 deletions utils/LastWeeks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from sklearn.base import BaseEstimator, TransformerMixin
from collections import deque
import numpy as np
import pandas as pd


class LastWeeks(BaseEstimator, TransformerMixin):
    """Adds the previous ``weeks`` values of the chosen attributes as features.

    A rolling window (deque) per city key ('sj'/'iq') holds the last
    ``weeks`` rows of the selected attributes; it is seeded in ``fit`` with
    the per-attribute medians and slid forward row by row in ``transform``.
    """

    def __init__(self, attributes, weeks=2, new_attributes_prefix='last_weeks_', copy=True):
        self.attributes = attributes
        self.weeks = weeks
        self.new_attributes_prefix = new_attributes_prefix
        self.copy = copy

    def fit(self, X, y=None):
        # Seed every window slot with the attribute medians so the very
        # first transformed rows get a sensible (if flat) history.
        attr_medians = [np.nanmedian(X[attr]) for attr in self.attributes]
        dq = deque([attr_medians for _ in range(self.weeks)])
        self.last = {'sj': dq.copy(), 'iq': dq.copy()}

        return self

    def transform(self, X):
        """Return ``X`` extended with ``weeks * len(attributes)`` lag columns.

        Mutates the rolling windows, so rows must arrive in chronological
        order per city — TODO confirm callers guarantee this.
        """
        if self.copy:
            X = X.copy()

        X.reset_index(drop=True, inplace=True)

        # One (weeks x attributes) history snapshot per input row.
        r = np.ndarray(shape=[X.shape[0], self.weeks, len(self.attributes)])

        for idx, week in X.iterrows():
            city = week['city']
            r[idx] = self.last[city]
            # Slide the window: newest row in front, oldest dropped.
            self.last[city].appendleft(week[self.attributes])
            self.last[city].pop()

        r = pd.DataFrame(r.reshape([X.shape[0], self.weeks * len(self.attributes)]),
                         columns=[self.new_attributes_prefix + str(week) + '_' + str(attr)
                                  for week in range(self.weeks)
                                  for attr in self.attributes
                                  ])

        X = pd.concat([X, r], axis=1)

        return X
21 changes: 21 additions & 0 deletions utils/NoiseRemover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from sklearn.base import BaseEstimator, TransformerMixin

class NoiseRemover(BaseEstimator, TransformerMixin):
    """Drops rows whose 'weekofyear' equals ``noisy_weeks`` — but only on
    the first ``transform`` call, which is assumed to be the training set.
    Every later call (validation / test data) passes through untouched.
    """

    def __init__(self, noisy_weeks=53, copy=True):
        self.noisy_weeks = noisy_weeks
        # Flips to False after the first transform so only train data is filtered.
        self.train_set = True
        self.copy = copy

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        # Guard clause: anything after the first call is returned unchanged.
        if not self.train_set:
            return X
        self.train_set = False
        frame = X.copy() if self.copy else X
        return frame[frame['weekofyear'] != self.noisy_weeks]


24 changes: 24 additions & 0 deletions utils/OurPipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from utils.ContinuityImputer import ContinuityImputer
from utils.DataFrameDropper import DataFrameDropper
from utils.LastWeeks import LastWeeks
from utils.LastInfected import LastInfected

def create_pipeline(attr, n_weeks, n_weeks_infected=None, estimator_optimizer=None, pca=None, add_noise=False, noise_mean=None, noise_std=None, n_non_train=4):
    """Assemble the preprocessing / estimation Pipeline used by the notebooks.

    Parameters
    ----------
    attr : full ordered attribute list; the first ``n_non_train`` entries are
        id-like columns kept through imputation but dropped before scaling.
    n_weeks : history window length for the LastWeeks step.
    n_weeks_infected : optional history window for LastInfected; the step is
        omitted entirely when None or 0.
    estimator_optimizer : optional final estimator step (omitted when None).
    pca : optional fitted-or-fresh PCA step (omitted when None).
    add_noise, noise_mean, noise_std : forwarded to LastInfected.
    n_non_train : number of leading non-feature columns in ``attr``.

    Returns
    -------
    sklearn.pipeline.Pipeline containing only the steps actually requested.
    """
    steps = [
        ('imputer', ContinuityImputer(attributes=attr[n_non_train:])),
        ('l_weeks', LastWeeks(attributes=attr[n_non_train:], weeks=n_weeks)),
    ]

    # Optional steps are appended only when configured: a ``(name, None)``
    # step is deprecated in scikit-learn and rejected by modern releases.
    if n_weeks_infected is not None and n_weeks_infected > 0:
        steps.append(('l_infected', LastInfected(weeks=n_weeks_infected, add_noise=add_noise, noise_mean=noise_mean, noise_std=noise_std)))

    steps.append(('dataframe_dropper', DataFrameDropper(attribute_names=attr[:n_non_train])))
    steps.append(('scaler', StandardScaler()))

    if pca is not None:
        steps.append(('pca', pca))
    if estimator_optimizer is not None:
        steps.append(('est_opt', estimator_optimizer))

    return Pipeline(steps)
Empty file added utils/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions utils/predict_in_order.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import numpy as np

def predict_in_order(X, model, pipeline):
    """Predict the target for each row of ``X`` strictly in row order.

    The pipeline's 'l_infected' step keeps a rolling window of the most
    recent case counts, so each week must be transformed and predicted
    before the next one; the rounded prediction is fed back into that
    window through ``append_y``.

    Parameters
    ----------
    X : DataFrame indexed 0..n-1 (rows are taken with ``.loc[idx:idx, :]``).
    model : fitted estimator exposing ``predict``.
    pipeline : fitted preprocessing pipeline whose ``named_steps`` contain
        an 'l_infected' step exposing ``append_y``.

    Returns
    -------
    list of int
        One rounded prediction per row of ``X``.
    """
    predictions = []
    for idx in range(X.shape[0]):
        x = pipeline.transform(X.loc[idx:idx, :])
        raw = model.predict(x)
        # ``predict`` normally returns a length-1 array; extract the scalar
        # explicitly instead of relying on int(array), which NumPy has
        # deprecated for arrays of ndim > 0.
        pred = int(np.round(np.asarray(raw).ravel()[0]))
        pipeline.named_steps['l_infected'].append_y(pred)
        predictions.append(pred)

    return predictions