# configuration.py

import os
import pickle

import numpy as np

from costcla import CostSensitiveDecisionTreeClassifier, CostSensitiveRandomForestClassifier, metrics as cost_metrics
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from cost_sensitive import Cost, cost_sensitive_data_re_balance
from imbalanced_test import smote, tomek_links, random_over_sampler

# Imbalance handling applied to the training data by ModelConfiguration.transform_dataset:
#   NONE              -> data is left as-is
#   SMOTE             -> smote(x, y)
#   TOMEK_OVERSAMPLE  -> tomek_links(x, y) followed by random_over_sampler(x, y)
IMBALANCE_OPTION_NONE = "kImbalanceNone"
IMBALANCE_OPTION_SMOTE = "kImbalanceSMOTE"
IMBALANCE_OPTION_TOMEK_OVERSAMPLE = "kImbalanceTomekOversample"

# Cost handling:
#   NONE                -> no cost-specific step
#   REJECTION_SAMPLING  -> training data goes through cost_sensitive_data_re_balance
#   MODEL               -> a costcla cost-sensitive classifier is instantiated
COST_OPTION_NONE = "kCostOptionNone"
COST_OPTION_REJECTION_SAMPLING = "kCostRejectionSample"
COST_OPTION_MODEL = "kCostModel"

# Model family selection:
#   WHITE_BOX -> decision tree (max_depth=5)
#   BLACK_BOX -> random forest
EXPLAIN_OPTION_WHITE_BOX = "kExplainableWhiteBox"
EXPLAIN_OPTION_BLACK_BOX = "kExplainableBlackBox"


#  Cost models documentation: http://albahnsen.github.io/CostSensitiveClassification/Models.html

def not_yet_implemented(x, y):
    """
    Stand-in for a dataset transformation that has not been written yet.
    Performs no work: both inputs are handed back exactly as received.
    :param x: array-like
    :param y: array-like
    :return: the tuple (x, y), with the very same objects that were passed in
    """
    untouched = (x, y)
    return untouched


class ModelConfiguration:
    """
    Bundle of the imbalance / cost / explainability options for one experiment.

    An instance can pre-process a training set according to those options (see transform_dataset)
    and build the matching classifier wrapped in a MetaModel (see create_model).
    """
    # Class-level defaults; __init__ overwrites every one of them on the instance.
    cost = Cost(0, 0)
    imbalance_option = ""
    cost_option = ""
    explain_option = ""

    def __init__(self, cost: Cost, imbalance_option: str = IMBALANCE_OPTION_SMOTE,
                 cost_option: str = COST_OPTION_MODEL, explain_option: str = EXPLAIN_OPTION_WHITE_BOX):
        """
        Store the options after checking each one against its set of known values.
        :param cost: Cost instance used for rejection sampling and for building cost matrices
        :param imbalance_option: one of the IMBALANCE_OPTION_* constants
        :param cost_option: one of the COST_OPTION_* constants
        :param explain_option: one of the EXPLAIN_OPTION_* constants
        :raises ValueError: if any option is not one of its known constants
        """
        self.cost = cost

        # Options are validated and stored one at a time, in this order.
        known_imbalance_options = (
            IMBALANCE_OPTION_NONE,
            IMBALANCE_OPTION_SMOTE,
            IMBALANCE_OPTION_TOMEK_OVERSAMPLE,
        )
        if imbalance_option not in known_imbalance_options:
            raise ValueError("Unexpected IMBALANCE option specified.")
        self.imbalance_option = imbalance_option

        known_cost_options = (
            COST_OPTION_NONE,
            COST_OPTION_REJECTION_SAMPLING,
            COST_OPTION_MODEL,
        )
        if cost_option not in known_cost_options:
            raise ValueError("Unexpected COST option specified.")
        self.cost_option = cost_option

        known_explain_options = (EXPLAIN_OPTION_WHITE_BOX, EXPLAIN_OPTION_BLACK_BOX)
        if explain_option not in known_explain_options:
            raise ValueError("Unexpected EXPLAIN option specified.")
        self.explain_option = explain_option

    def transform_dataset(self, x, y):
        """
        Apply the configured imbalance step, then the optional cost re-balancing step.
        :param x: features, forwarded to the re-sampling helpers
        :param y: labels, forwarded to the re-sampling helpers
        :return: the (possibly re-sampled) pair x, y
        """
        chosen_imbalance = self.imbalance_option

        if chosen_imbalance == IMBALANCE_OPTION_SMOTE:
            x, y = smote(x, y)
        elif chosen_imbalance == IMBALANCE_OPTION_TOMEK_OVERSAMPLE:
            # Tomek links first, random over-sampling on its output second.
            cleaned_x, cleaned_y = tomek_links(x, y)
            x, y = random_over_sampler(cleaned_x, cleaned_y)

        needs_cost_re_balance = self.cost_option == COST_OPTION_REJECTION_SAMPLING
        if needs_cost_re_balance:
            x, y = cost_sensitive_data_re_balance(x, y, self.cost)

        return x, y

    def __get_model(self):
        """
        Instantiate the classifier selected by explain_option and cost_option.
        :return: a decision tree (white box) or random forest (any other explain option);
                 the costcla variant is used when cost_option is COST_OPTION_MODEL
        """
        wants_cost_model = self.cost_option == COST_OPTION_MODEL

        if self.explain_option == EXPLAIN_OPTION_WHITE_BOX:
            tree_class = CostSensitiveDecisionTreeClassifier if wants_cost_model else DecisionTreeClassifier
            return tree_class(max_depth=5)

        forest_class = CostSensitiveRandomForestClassifier if wants_cost_model else RandomForestClassifier
        return forest_class()

    def create_model(self):
        """
        Build the classifier for this configuration and wrap it.
        :return: MetaModel holding this configuration and a freshly created classifier
        """
        classifier = self.__get_model()
        return MetaModel(configuration=self, ml_model=classifier)


class MetaModel:
    """
    This class encapsulates an ml model generated by the Configuration and wraps common methods
    (fit, predict, metric printing, naming and saving).
    """
    # Class-level defaults; __init__ overwrites both on the instance.
    configuration = ModelConfiguration(Cost(0, 0))
    ml_model = None

    def __init__(self, configuration, ml_model):
        """
        :param configuration: ModelConfiguration the model was created from
        :param ml_model: the wrapped classifier; must expose fit and predict
        """
        self.configuration = configuration
        self.ml_model = ml_model

    def _cost_matrix(self, labels):
        """
        Build the per-sample cost matrix handed to the costcla estimators and metrics.
        :param labels: iterable of labels, one per sample
        :return: numpy array with one costcla_cost_array(label) row per label
        """
        return np.asarray([self.configuration.cost.costcla_cost_array(label) for label in labels])

    def fit(self, x, y):
        """
        Train the wrapped model.
        The costcla estimators additionally receive a cost matrix derived from y; every other
        model is fitted with plain fit(x, y), which raises if the object has no such method.
        :param x: training features
        :param y: training labels
        """
        cost_sensitive_classes = (CostSensitiveDecisionTreeClassifier, CostSensitiveRandomForestClassifier)

        if isinstance(self.ml_model, cost_sensitive_classes):
            self.ml_model.fit(x, y, cost_mat=self._cost_matrix(y))
        else:
            self.ml_model.fit(x, y)

    def predict(self, x):
        """
        Predict with the wrapped model.
        :param x: features to predict on
        :return: whatever the wrapped model's predict returns
        """
        # Every supported model exposes the same predict(x) call; anything else raises here.
        return self.ml_model.predict(x)

    def print_metrics(self, y_test, y_pred):
        """
        Print evaluation metrics to stdout.
        When the configuration uses COST_OPTION_MODEL the costcla cost loss and binary
        classification metrics are printed first, followed in every case by accuracy,
        recall, precision and F1.
        :param y_test: true labels
        :param y_pred: predicted labels
        """
        if self.configuration.cost_option == COST_OPTION_MODEL:
            costs = self._cost_matrix(y_test)

            cost_loss = cost_metrics.cost_loss(y_test, y_pred, costs)
            print("\tCost loss: %f" % cost_loss)

            # NOTE(review): the predictions are passed a second time as the third argument;
            # confirm against the costcla documentation that this is intended.
            bin_class_metrics = cost_metrics.binary_classification_metrics(y_test, y_pred, y_pred)
            print("\tBinary classification metrics:", bin_class_metrics)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)

        print("\tAccuracy: %f" % accuracy)
        print("\tRecall: %f" % recall)
        print("\tPrecision: %f" % precision)
        print("\tF1: %f" % f1)

    def model_name(self):
        """
        Build a descriptive name of the form <imbalance prefix><model description>.
        :return: str, e.g. "smote_cost_decision_tree_classifier"
        """
        without_cost = self.configuration.cost_option == COST_OPTION_NONE

        if isinstance(self.ml_model, CostSensitiveDecisionTreeClassifier):
            name = "cost_decision_tree_classifier"
        elif isinstance(self.ml_model, CostSensitiveRandomForestClassifier):
            name = "cost_random_forest_classifier"
        elif isinstance(self.ml_model, DecisionTreeClassifier):
            # A plain model with any cost option other than NONE is reported as "cost_resample".
            name = "no_cost_decision_tree_classifier" if without_cost \
                else "cost_resample_decision_tree_classifier"
        elif isinstance(self.ml_model, RandomForestClassifier):
            name = "no_cost_random_forest_classifier" if without_cost \
                else "cost_resample_random_forest_classifier"
        else:  # unknown model
            name = "unknown"

        if self.configuration.imbalance_option == IMBALANCE_OPTION_SMOTE:
            imbalance_method = "smote_"
        elif self.configuration.imbalance_option == IMBALANCE_OPTION_TOMEK_OVERSAMPLE:
            imbalance_method = "tomek_oversample_"
        else:
            imbalance_method = "no_imbalance_"

        return imbalance_method + name

    def save_model(self, path):
        """
        Pickle the wrapped model to <path>/<model_name()>.model.
        :param path: existing directory the file is written into
        """
        file_name = os.path.join(path, self.model_name() + ".model")
        # The context manager guarantees the file is flushed and closed, even if pickling fails.
        with open(file_name, "wb") as model_file:
            pickle.dump(self.ml_model, model_file)


def ad_hoc_try_logistic_reg(imbalance_option: str, x_train, y_train, x_test, y_test, path: str):
    """
    Train, evaluate and save a LogisticRegression model outside of the ModelConfiguration flow.
    The training data is optionally re-sampled first, metrics are printed to stdout and the
    fitted model is pickled to <path>/<name>.model.
    :param imbalance_option: IMBALANCE_OPTION_SMOTE or IMBALANCE_OPTION_TOMEK_OVERSAMPLE to
                             re-sample; any other value leaves the training data untouched
    :param x_train: training features
    :param y_train: training labels
    :param x_test: test features
    :param y_test: test labels
    :param path: existing directory the model file is written into
    """
    name = "no_imbalance_no_cost_logistic_regression_classifier"

    if imbalance_option == IMBALANCE_OPTION_SMOTE:
        name = "smote_no_cost_logistic_regression_classifier"
        # call smote func
        x_train, y_train = smote(x_train, y_train)
    elif imbalance_option == IMBALANCE_OPTION_TOMEK_OVERSAMPLE:
        name = "tomek_oversample_no_cost_logistic_regression_classifier"
        # do tomek links + random over sampler
        x_train, y_train = tomek_links(x_train, y_train)
        x_train, y_train = random_over_sampler(x_train, y_train)

    model = LogisticRegression()
    print("Created ad-hoc model: ", name, ".")

    # Announce training before it starts, so the progress output matches what is running.
    print("Training ML model...")
    model.fit(x_train, y_train)

    print("Evaluating ML model...")
    y_pred = model.predict(x_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    print("\tAccuracy: %f" % accuracy)
    print("\tRecall: %f" % recall)
    print("\tPrecision: %f" % precision)
    print("\tF1: %f" % f1)

    file_name = os.path.join(path, name + ".model")
    # The context manager guarantees the file is flushed and closed, even if pickling fails.
    with open(file_name, "wb") as model_file:
        pickle.dump(model, model_file)