Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
runs
catboost_info
*.pyc
__pycache__/
*.log
*.whl
*.gz
*.__pycache__
build/
dist/
*egg-info*/
Expand Down
8 changes: 4 additions & 4 deletions alphapy/alphapy_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,13 +257,13 @@ def training_pipeline(alphapy_specs, model):
logger.info("Original Feature Statistics")
logger.info("Number of Training Rows : %d", X_train.shape[0])
logger.info("Number of Training Columns : %d", X_train.shape[1])
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
uv, uc = np.unique(y_train, return_counts=True)
logger.info("Unique Training Values for %s : %s", target, uv)
logger.info("Unique Training Counts for %s : %s", target, uc)
logger.info("Number of Testing Rows : %d", X_test.shape[0])
logger.info("Number of Testing Columns : %d", X_test.shape[1])
if (model_type == ModelType.classification or model_type == ModelType.system) and model.test_labels:
if (model_type == ModelType.classification) and model.test_labels:
uv, uc = np.unique(y_test, return_counts=True)
logger.info("Unique Testing Values for %s : %s", target, uv)
logger.info("Unique Testing Counts for %s : %s", target, uc)
Expand Down Expand Up @@ -302,7 +302,7 @@ def training_pipeline(alphapy_specs, model):

# Create crosstabs for any categorical features

if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
create_crosstabs(model, target)

# Create initial features
Expand Down Expand Up @@ -524,7 +524,7 @@ def prediction_pipeline(alphapy_specs, model):
logger.info("Making Predictions")
tag = 'BEST'
model.preds[(tag, partition)] = predictor.predict(X_all)
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
model.probas[(tag, partition)] = predictor.predict_proba(X_all)[:, 1]

# Save predictions
Expand Down
103 changes: 103 additions & 0 deletions alphapy/calendrical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,3 +1271,106 @@ def set_holidays(gyear, observe):
else:
holidays[h] = hfunc(gyear)
return holidays


#
# Function dateparts
#

def dateparts(df, c):
    r"""Split a date column into year, month, day, and day-of-week columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the column ``c``. A frame whose columns
        expose a ``to_pandas()`` method is also accepted; such a column
        is converted with that method before parsing.
    c : str
        Name of the column in the dataframe ``df``.

    Returns
    -------
    date_features : pandas.DataFrame
        Integer columns ``year``, ``month``, ``day``, and ``dayofweek``
        (taken from the pandas ``.dt`` accessor), or an empty dataframe
        when the components cannot be extracted.

    Notes
    -----
    The call to ``pd.to_datetime`` is made before the guarded section, so
    any error it raises propagates to the caller instead of being logged.
    """

    column = df[c]
    if hasattr(column, "to_pandas"):
        column = column.to_pandas()
    stamps = pd.to_datetime(column)

    part_names = ('year', 'month', 'day', 'dayofweek')
    try:
        # One integer series per component, named after the component.
        parts = [pd.Series(getattr(stamps.dt, part), name=part).astype(int)
                 for part in part_names]
        return pd.concat(parts, axis=1)
    except Exception:
        logger.info("Could not extract date information from the series")
        return pd.DataFrame()


#
# Function timeparts
#

def timeparts(df, c):
    r"""Extract time into its components: hour, minute, second.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the column ``c``. As in ``dateparts``, a
        column exposing a ``to_pandas()`` method is converted with that
        method before parsing.
    c : str
        Name of the column in the dataframe ``df``.

    Returns
    -------
    time_features : pandas.DataFrame
        Integer columns ``hour``, ``minute``, and ``second``, or an empty
        dataframe when the components cannot be extracted.

    Notes
    -----
    Values are converted to strings and parsed with the fixed format
    ``%H:%M:%S``. Parsing happens before the guarded section, so an error
    raised by ``pd.to_datetime`` propagates to the caller.
    """

    col = df[c]
    # Mirror dateparts: non-pandas columns lack ``astype``, so convert
    # them to a pandas Series first.
    if hasattr(col, "to_pandas"):
        col = col.to_pandas()
    ds_dt = pd.to_datetime(col.astype(str), format='%H:%M:%S')
    time_features = pd.DataFrame()
    try:
        fhour = pd.Series(ds_dt.dt.hour, name='hour').astype(int)
        fminute = pd.Series(ds_dt.dt.minute, name='minute').astype(int)
        fsecond = pd.Series(ds_dt.dt.second, name='second').astype(int)
        frames = [fhour, fminute, fsecond]
        time_features = pd.concat(frames, axis=1)
    except Exception:
        logger.info("Could not extract time information from the series")
    return time_features


#
# Function bizday
#

def bizday(df, c):
    r"""Add business-day-of-month and business-day-of-week features.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the column ``c``.
    c : str
        Name of the column in the dataframe ``df``.

    Returns
    -------
    date_features : pandas.DataFrame
        The output of ``dateparts`` with the columns ``bizdaymonth`` and
        ``bizdayweek`` appended.

    Notes
    -----
    Any exception is logged rather than raised. If ``dateparts`` itself
    fails, an empty dataframe is returned; if a later step fails, the
    plain ``dateparts`` output is returned without the two extra columns.
    """

    features = pd.DataFrame()
    try:
        features = dateparts(df, c)
        # Collapse each row of date parts into a single value via get_rdate.
        rdates = features.apply(get_rdate, axis=1)
        derived = (
            (biz_day_month, 'bizdaymonth'),
            (biz_day_week, 'bizdayweek'),
        )
        extras = [pd.Series(rdates.apply(func), name=label)
                  for func, label in derived]
        features = pd.concat([features, *extras], axis=1)
    except Exception:
        logger.info("Could not extract business date information from the series")
    return features
7 changes: 1 addition & 6 deletions alphapy/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,14 @@
from alphapy.globals import datasets
from alphapy.globals import ModelType
from alphapy.globals import Partition
from alphapy.globals import PD_INTRADAY_OFFSETS
from alphapy.globals import SSEP
from alphapy.globals import WILDCARD
from alphapy.space import Space

from datetime import datetime
from io import BytesIO
import logging
import numpy as np
import polars as pl
import re
import requests
from sklearn.preprocessing import LabelEncoder
import sys

Expand Down Expand Up @@ -121,7 +117,7 @@ def get_data(model, partition):
# assign the target column to y
df_y = df.select(target)
# encode label only for classification
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
y = LabelEncoder().fit_transform(df_y[target].to_numpy())
df_y = pl.DataFrame({target: y})
# drop the target from the original frame
Expand Down Expand Up @@ -178,4 +174,3 @@ def shuffle_data(model):

return model


2 changes: 1 addition & 1 deletion alphapy/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -1543,7 +1543,7 @@ def create_interactions(model, X):
logger.info("Polynomial Degree : %d", poly_degree)
if model_type == ModelType.regression:
selector = SelectPercentile(f_regression, percentile=isample_pct)
elif model_type == ModelType.classification or model_type == ModelType.system:
elif model_type == ModelType.classification:
selector = SelectPercentile(f_classif, percentile=isample_pct)
else:
raise TypeError("Unknown model type when creating interactions")
Expand Down
49 changes: 0 additions & 49 deletions alphapy/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ class ModelType(Enum):
ranking = 2
multiclass = 3
regression = 4
system = 5


#
Expand All @@ -151,54 +150,6 @@ class Objective(Enum):
minimize = 2


#
# Bar Types
#

@unique
class BarType(Enum):
"""Bar Types.

Bar Types for running models, usually translated from a normal OHLC bar to a
weighted bar based on volume, dollar amount, etc.

"""
time = 1
dollar = 2
heikinashi = 3


#
# Class Orders
#

class Orders:
"""System Order Types.

Attributes
----------
le : str
long entry
se : str
short entry
lx : str
long exit
sx : str
short exit
lh : str
long exit at the end of the holding period
sh : str
short exit at the end of the holding period

"""
le = 'le'
se = 'se'
lx = 'lx'
sx = 'sx'
lh = 'lh'
sh = 'sh'


#
# Partition Types
#
Expand Down
14 changes: 7 additions & 7 deletions alphapy/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,7 @@ def make_predictions(model, algo):
model.preds[(algo, Partition.train)] = train_preds
model.preds[(algo, Partition.test)] = test_preds

if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
# Get probabilities for the positive class.
train_probas = est.predict_proba(X_train)
test_probas = est.predict_proba(X_test)
Expand Down Expand Up @@ -945,7 +945,7 @@ def predict_blend(model):
for partition in datasets.keys():
pred_set = [model.preds[key] for key, _ in model.preds.items() if partition in key]
model.preds[(blend_tag, partition)] = np.round(np.mean(pred_set, axis=0), 0).astype(int)
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
proba_set = [model.probas[key] for key, _ in model.probas.items() if partition in key]
model.probas[(blend_tag, partition)] = np.mean(proba_set, axis=0)
model.preds[(blend_tag, partition)] = np.round(model.probas[(blend_tag, partition)], 0).astype(int)
Expand Down Expand Up @@ -1048,7 +1048,7 @@ def select_best_model(model, partition):
model.best_algo = best_algo
model.estimators[best_tag] = model.estimators[best_algo]
model.preds[(best_tag, partition)] = model.preds[(best_algo, partition)]
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
model.probas[(best_tag, partition)] = model.probas[(best_algo, partition)]

# Record support vector for any recursive feature elimination
Expand Down Expand Up @@ -1173,7 +1173,7 @@ def kendall_tau(expected, predicted):
# get predictions for the given algorithm
predicted = model.preds[(algo, partition)]
# Classification Metrics
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
probas = model.probas[(algo, partition)]
try:
model.metrics[(algo, partition, 'accuracy')] = accuracy_score(expected, predicted)
Expand Down Expand Up @@ -1369,14 +1369,14 @@ def save_predictions(model, partition):
for tag_id in tag_list:
pred_name = USEP.join(['pred', datasets[partition], tag_id.lower()])
df_master = df_master.with_columns(pl.Series(pred_name, model.preds[(tag_id, partition)]))
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
prob_name = USEP.join(['prob', datasets[partition], tag_id.lower()])
df_master = df_master.with_columns(pl.Series(prob_name, model.probas[(tag_id, partition)]))

# Save ranked predictions

logger.info("Saving Ranked Predictions")
if model_type == ModelType.classification or model_type == ModelType.system:
if model_type == ModelType.classification:
prob_name = USEP.join(['prob', datasets[partition], sort_tag.lower()])
df_master = df_master.sort(prob_name, descending=True)
else:
Expand All @@ -1391,7 +1391,7 @@ def save_predictions(model, partition):
sample_spec = PSEP.join([submission_file, extension])
sample_input = SSEP.join([data_dir, sample_spec])
df_sub = pd.read_csv(sample_input)
if submit_probas and (model_type == ModelType.classification or model_type == ModelType.system):
if submit_probas and (model_type == ModelType.classification):
df_sub[df_sub.columns[1]] = model.probas[(model.best_algo, Partition.test)]
else:
df_sub[df_sub.columns[1]] = model.preds[(model.best_algo, Partition.test)]
Expand Down
8 changes: 4 additions & 4 deletions alphapy/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def plot_calibration(model, partition):
# For classification only

model_type = model.specs['model_type']
if model_type != ModelType.classification and model_type != ModelType.system:
if model_type != ModelType.classification:
logger.info('Calibration plot is for classification only')
return None

Expand Down Expand Up @@ -571,7 +571,7 @@ def plot_roc_curve(model, partition):
# For classification only

model_type = model.specs['model_type']
if model_type != ModelType.classification and model_type != ModelType.system:
if model_type != ModelType.classification:
logger.info('ROC Curves are for classification only')
return None

Expand Down Expand Up @@ -648,7 +648,7 @@ def plot_confusion_matrix(model, partition):
# For classification only

model_type = model.specs['model_type']
if model_type != ModelType.classification and model_type != ModelType.system:
if model_type != ModelType.classification:
logger.info('Confusion Matrix is for classification only')
return None

Expand Down Expand Up @@ -850,7 +850,7 @@ def plot_boundary(model, partition, f1=0, f2=1):
# For classification only

model_type = model.specs['model_type']
if model_type != ModelType.classification and model_type != ModelType.system:
if model_type != ModelType.classification:
logger.info('Boundary Plots are for classification only')
return None

Expand Down
Loading
Loading