This code analyzes Kobe Bryant's shot data; the author's data-visualization sections are omitted here. It moves cleanly from data processing and cleaning, through feature selection, to model selection. (The data format matches Kaggle's "Kobe Bryant Shot Selection" competition: rows in data.csv with a blank shot_made_flag are the ones to predict.)
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (BaggingClassifier,
                              ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              VotingClassifier,
                              RandomForestClassifier,
                              AdaBoostClassifier)
################################### Data processing ################################
######################################
# Data preprocessing
######################################
pd.set_option('display.max_columns', None)
data = pd.read_csv('./data/data.csv')
data.set_index('shot_id', inplace=True)
data["action_type"] = data["action_type"].astype('object')
data["combined_shot_type"] = data["combined_shot_type"].astype('category')
data["game_event_id"] = data["game_event_id"].astype('category')
data["game_id"] = data["game_id"].astype('category')
data["period"] = data["period"].astype('object')
data["playoffs"] = data["playoffs"].astype('category')
data["season"] = data["season"].astype('category')
data["shot_made_flag"] = data["shot_made_flag"].astype('category')
data["shot_type"] = data["shot_type"].astype('category')
data["team_id"] = data["team_id"].astype('category')
unknown_mask = data['shot_made_flag'].isnull()
data_cl = data.copy() # create a copy of data frame
target = data_cl['shot_made_flag'].copy()
# Remove some columns
data_cl.drop('team_id', axis=1, inplace=True) # Always one number
data_cl.drop('lat', axis=1, inplace=True) # Correlated with loc_y
data_cl.drop('lon', axis=1, inplace=True) # Correlated with loc_x
data_cl.drop('game_id', axis=1, inplace=True) # Identifier, not predictive
data_cl.drop('game_event_id', axis=1, inplace=True) # Identifier, not predictive
data_cl.drop('team_name', axis=1, inplace=True) # Always LA Lakers
data_cl.drop('shot_made_flag', axis=1, inplace=True)
data_cl['seconds_from_period_end'] = 60 * data_cl['minutes_remaining'] + data_cl['seconds_remaining']
data_cl['last_5_sec_in_period'] = data_cl['seconds_from_period_end'] < 5
data_cl.drop('minutes_remaining', axis=1, inplace=True)
data_cl.drop('seconds_remaining', axis=1, inplace=True)
data_cl.drop('seconds_from_period_end', axis=1, inplace=True)
# Matchup - 'vs.' marks a home game, '@' an away game
data_cl['home_play'] = data_cl['matchup'].str.contains('vs').astype('int')
data_cl.drop('matchup', axis=1, inplace=True)
# Game date
data_cl['game_date'] = pd.to_datetime(data_cl['game_date'])
data_cl['game_year'] = data_cl['game_date'].dt.year
data_cl['game_month'] = data_cl['game_date'].dt.month
data_cl.drop('game_date', axis=1, inplace=True)
# Bin loc_x and loc_y into 25 intervals each
data_cl['loc_x'] = pd.cut(data_cl['loc_x'], 25)
data_cl['loc_y'] = pd.cut(data_cl['loc_y'], 25)
# Replace 20 least common action types with value 'Other'
rare_action_types = data_cl['action_type'].value_counts().sort_values().index.values[:20]
data_cl.loc[data_cl['action_type'].isin(rare_action_types), 'action_type'] = 'Other'
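# One-hot encode each categorical column, prefixing every dummy with its source column name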
categorical_cols = [
    'action_type', 'combined_shot_type', 'period', 'season', 'shot_type',
    'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'game_year',
    'game_month', 'opponent', 'loc_x', 'loc_y']
for cc in categorical_cols:
    dummies = pd.get_dummies(data_cl[cc])
    dummies = dummies.add_prefix("{}#".format(cc))
    data_cl.drop(cc, axis=1, inplace=True)
    data_cl = data_cl.join(dummies)
# Outlier detection (boxplot-style rule: flag points more than whis * IQR from the median); defined but not used below
def detect_outliers(series, whis=1.5):
    q75, q25 = np.percentile(series, [75, 25])
    iqr = q75 - q25
    return ~((series - series.median()).abs() <= (whis * iqr))
# Separate dataset for validation
data_submit = data_cl[unknown_mask]
# Training data (rows with a known shot_made_flag)
X = data_cl[~unknown_mask]
Y = target[~unknown_mask]
################################### Feature selection ###################
#################################
# Variance threshold, then RandomForestClassifier feature importances
###############################
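# For a 0/1 dummy feature with frequency p the variance is p * (1 - p), so the
# cutoff threshold * (1 - threshold) = 0.9 * 0.1 = 0.09 drops near-constant columns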
threshold = 0.90
vt = VarianceThreshold().fit(X)
feat_var_threshold = data_cl.columns[vt.variances_ > threshold * (1-threshold)]
feat_var_threshold
model = RandomForestClassifier()
model.fit(X, Y)
feature_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["importance"])
feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(20).index
#################################
# Univariate feature selection
#################################
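# chi2 requires non-negative feature values, hence the scaling to [0, 1] first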
X_minmax = MinMaxScaler(feature_range=(0,1)).fit_transform(X)
X_scored = SelectKBest(score_func=chi2, k='all').fit(X_minmax, Y)
feature_scoring = pd.DataFrame({
    'feature': X.columns,
    'score': X_scored.scores_
})
feat_scored_20 = feature_scoring.sort_values('score', ascending=False).head(20)['feature'].values
feat_scored_20
#################################
# Recursive Feature Elimination
#################################
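# RFE recursively drops the weakest features; ranking_ == 1 marks the 20 that survive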
rfe = RFE(LogisticRegression(), n_features_to_select=20)
rfe.fit(X, Y)
feature_rfe_scoring = pd.DataFrame({
    'feature': X.columns,
    'score': rfe.ranking_
})
feat_rfe_20 = feature_rfe_scoring[feature_rfe_scoring['score'] == 1]['feature'].values
feat_rfe_20
###############################
# Merge the results of all feature-selection methods
################################
features = np.hstack([
    feat_var_threshold,
    feat_imp_20,
    feat_scored_20,
    feat_rfe_20
])
features = np.unique(features)
print('Final features set:\n')
for f in features:
    print("\t-{}".format(f))
################################
# Keep only the selected features
###############################
data_cl = data_cl.loc[:, features]
data_submit = data_submit.loc[:, features]
X = X.loc[:, features]
print('Clean dataset shape: {}'.format(data_cl.shape))
print('Submission dataset shape: {}'.format(data_submit.shape))
print('Train features shape: {}'.format(X.shape))
print('Target label shape: {}'.format(Y.shape))
#################################
# PCA
#################################
components = 8
pca = PCA(n_components=components).fit(X)
pca_variance_explained_df = pd.DataFrame({
    "component": np.arange(1, components+1),
    "variance_explained": pca.explained_variance_ratio_
})
ax = sns.barplot(x='component', y='variance_explained', data=pca_variance_explained_df)
ax.set_title("PCA - Variance explained")
plt.show()
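# Optional check (not in the original code): cumulative variance explained,
# to judge whether 8 components would be enough for a PCA-based pipeline
print(np.cumsum(pca.explained_variance_ratio_))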
###################################
# Evaluation setup (cross-validation configuration)
###################################
seed = 7
processors = 1
num_folds = 3
scoring = 'neg_log_loss'  # scikit-learn scorers maximize, so log loss is negated
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
################################# Model selection ##############################
#################################
# Baseline models
#################################
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('K-NN', KNeighborsClassifier(n_neighbors=5)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
#models.append(('SVC', SVC(probability=True)))
# Evaluate each model in turn
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))
##################################
# Bootstrap Aggregation
###################################
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(estimator=cart, n_estimators=num_trees, random_state=seed)  # 'base_estimator' is 'estimator' in current scikit-learn
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
#####################################
# Random Forest
#####################################
num_trees = 100
num_features = 10
model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
#####################################
# extra tree
#######################################
num_trees = 100
num_features = 10
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=num_features)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
#######################################
# AdaBoost
######################################
model = AdaBoostClassifier(n_estimators=100, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
############################################
# Stochastic Gradient Boosting
#############################################
model = GradientBoostingClassifier(n_estimators=100, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
############################ Hyperparameter search ################################
#####################################
# Logistic regression
######################################
lr_grid = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=seed),  # liblinear supports both l1 and l2 penalties
    param_grid={
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 1, 10, 100, 1000]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
lr_grid.fit(X, Y)
print(lr_grid.best_score_)
print(lr_grid.best_params_)
#########################################
# LinearDiscriminant
########################################
lda_grid = GridSearchCV(
    estimator=LinearDiscriminantAnalysis(),
    param_grid={
        'solver': ['lsqr'],
        # n_components is not searched: a binary target allows at most n_classes - 1 = 1 component
        'shrinkage': [0, 0.25, 0.5, 0.75, 1]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
lda_grid.fit(X, Y)
print(lda_grid.best_score_)
print(lda_grid.best_params_)
#######################################
# KNN
############################################
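# Note: most parameters below are pinned to a single value; only leaf_size is actually searched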
knn_grid = GridSearchCV(
    estimator=Pipeline([
        ('min_max_scaler', MinMaxScaler()),
        ('knn', KNeighborsClassifier())
    ]),
    param_grid={
        'knn__n_neighbors': [25],
        'knn__algorithm': ['ball_tree'],
        'knn__leaf_size': [2, 3, 4],
        'knn__p': [1]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
knn_grid.fit(X, Y)
print(knn_grid.best_score_)
print(knn_grid.best_params_)
###############################################
# Random forest hyperparameter search
##############################################
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(warm_start=True, random_state=seed),
    param_grid={
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_features': [18, 20],
        'max_depth': [8, 10],
        'bootstrap': [True]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
rf_grid.fit(X, Y)
print(rf_grid.best_score_)
print(rf_grid.best_params_)
############################################
# AdaBoost hyperparameter search
##############################################
ada_grid = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=seed),
    param_grid={
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [10, 25, 50],
        'learning_rate': [1e-3, 1e-2, 1e-1]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
ada_grid.fit(X, Y)
print(ada_grid.best_score_)
print(ada_grid.best_params_)
#################################################
# GradientBoosting
#################################################
gbm_grid = GridSearchCV(
    estimator=GradientBoostingClassifier(warm_start=True, random_state=seed),
    param_grid={
        'n_estimators': [100, 200],
        'max_depth': [2, 3, 4],
        'max_features': [10, 15, 20],
        'learning_rate': [1e-1, 1]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
gbm_grid.fit(X, Y)
print(gbm_grid.best_score_)
print(gbm_grid.best_params_)
#################################################
# Ensemble the models selected above
#################################################
estimators = []
estimators.append(('lr', LogisticRegression(penalty='l2', C=1)))
estimators.append(('gbm', GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, max_features=15, warm_start=True, random_state=seed)))
estimators.append(('rf', RandomForestClassifier(bootstrap=True, max_depth=8, n_estimators=200, max_features=20, criterion='entropy', random_state=seed)))
estimators.append(('ada', AdaBoostClassifier(algorithm='SAMME.R', learning_rate=1e-2, n_estimators=10, random_state=seed)))
# create the ensemble model
ensemble = VotingClassifier(estimators, voting='soft', weights=[2,3,3,1])
results = cross_val_score(ensemble, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
###############################################
# Prediction and submission
###############################################
model = ensemble
model.fit(X, Y)
preds = model.predict_proba(data_submit)
submission = pd.DataFrame()
submission["shot_id"] = data_submit.index
submission["shot_made_flag"]= preds[:,0]
submission.to_csv("sub.csv",index=False)
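# Sanity check (not in the original code): one row per unlabeled shot, probabilities in [0, 1]
print(submission.shape)
print(submission['shot_made_flag'].describe())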