#!/usr/bin/env python
# -*- coding: utf-8 -*-
from sklearn.svm import SVC
import logging
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
# Configure the root logger when this module is imported so that the
# INFO-level progress messages logged during training are emitted.
logging.basicConfig(level=logging.INFO)
def train(X, Y, args):
    """
    Train a classifier on the given features and ground-truth labels.

    Parameters
    ----------
    X : array
        features of the dataset
    Y : array
        corresponding labels
    args : object
        configuration read through attribute access (for example an
        ``argparse.Namespace``). Two attributes are used:

        * ``model`` -- ``"svm"`` or ``"rf"``
        * ``gridsearch`` -- ``"y"`` to run a grid search over a fixed
          parameter grid, ``"n"`` to use the predefined hyperparameters

    Returns
    -------
    model : object
        The fitted estimator. Without grid search this is an ``SVC`` or a
        ``RandomForestClassifier``; with grid search it is the fitted
        ``GridSearchCV`` wrapper.

    Raises
    ------
    ValueError
        If ``args.model`` or ``args.gridsearch`` is not one of the
        supported values.
    """
    # Validate up front: an unrecognised option used to fall through every
    # branch and silently return None, which only failed later at predict time.
    if args.model not in ("svm", "rf"):
        raise ValueError(
            f"Unknown model {args.model!r}; expected 'svm' or 'rf'."
        )
    if args.gridsearch not in ("y", "n"):
        raise ValueError(
            f"Unknown gridsearch option {args.gridsearch!r}; expected 'y' or 'n'."
        )
    use_grid_search = args.gridsearch == "y"

    # SVM model selected
    if args.model == "svm":
        logging.info("Training SVM model...")

        if not use_grid_search:
            logging.info("Using predefined parameters.")
            # Radial kernel with predefined hyperparameters
            svm_model = SVC(kernel="rbf", gamma=0.0001, C=1000)
            svm_model.fit(X, Y)
            return svm_model

        logging.info("Doing grid search, it may take a while...")
        # One sub-grid per kernel so that only the hyperparameters listed
        # for that kernel are searched.
        params_grid = [
            {"kernel": ["rbf"], "gamma": [1e-2, 1e-3, 1e-4], "C": [10, 100, 1000]},
            {"kernel": ["linear"], "C": [10, 100, 1000]},
            {
                "kernel": ["poly"],
                "gamma": [1e-2, 1e-3, 1e-4],
                "degree": [3, 4, 5],
                "C": [10, 100, 1000],
            },
            {
                "kernel": ["sigmoid"],
                "gamma": [1e-2, 1e-3, 1e-4],
                "C": [10, 100, 1000],
            },
        ]
        svm_model = GridSearchCV(SVC(), params_grid, cv=3, verbose=10, n_jobs=-1)
        svm_model.fit(X, Y)
        logging.info("Using hyperparameters: %s", svm_model.best_params_)
        return svm_model

    # Random forest model selected (the only remaining valid option)
    logging.info("Training RF model...")

    if not use_grid_search:
        logging.info("Using predefined parameters.")
        rf_model = RandomForestClassifier(
            max_depth=25,
            n_estimators=50,
            min_samples_split=2,
            min_samples_leaf=4,
            bootstrap=True,
            random_state=42,
        )
        rf_model.fit(X, Y)
        return rf_model

    logging.info("Doing grid search, it may take a while...")
    param_grid = {
        "n_estimators": [50, 75, 100],
        "max_depth": [10, 25, 50],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True],
    }
    rf = RandomForestClassifier(random_state=42)
    rf_model = GridSearchCV(
        estimator=rf, param_grid=param_grid, cv=3, verbose=10, n_jobs=-1
    )
    rf_model.fit(X, Y)
    logging.info("Using hyperparameters: %s", rf_model.best_params_)
    return rf_model
def predict(X, model):
    """
    Predict labels for the given features with a trained model.

    Parameters
    ----------
    X : array
        features to predict on
    model : object
        trained model; must expose a ``predict(X)`` method

    Returns
    -------
    predictions : array
        Whatever ``model.predict(X)`` returns (the predicted labels)
    """
    return model.predict(X)