import os
import pkg_resources
from typing import Union
import joblib
import numpy as np
import pandas as pd
import sklearn
import yaml
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
def get_balancing_authority_to_model_dict():
    """Return a dictionary of balancing authorities that have been modeled.

    The mapping is read from the ``balancing_authority_modeled.yml`` file
    shipped in the package data.

    :return: Dictionary keyed by balancing authority abbreviation.
    :rtype: dict
    """
    ba_file = pkg_resources.resource_filename("tell", "data/balancing_authority_modeled.yml")

    # read into a dictionary; FullLoader is acceptable here because the YAML
    # file is trusted package data, not user input
    with open(ba_file, 'r') as yml:
        return yaml.load(yml, Loader=yaml.FullLoader)
def normalize_prediction_data(data_arr: np.ndarray,
                              min_train_arr: np.ndarray,
                              max_train_arr: np.ndarray) -> np.ndarray:
    """Normalize target data using existing min and max from the training data.

    Applies min-max scaling: ``(x - min) / (max - min)``.

    :param data_arr: Array of target data
    :type data_arr: np.ndarray

    :param min_train_arr: Array of previously computed minimums of the training targets
    :type min_train_arr: np.ndarray

    :param max_train_arr: Array of previously computed maximums of the training targets
    :type max_train_arr: np.ndarray

    :return: Array of normalized target data
    :rtype: np.ndarray
    """
    return np.divide((data_arr - min_train_arr), (max_train_arr - min_train_arr))
def normalize_features(x_train: np.ndarray,
                       x_test: np.ndarray,
                       y_train: np.ndarray,
                       y_test: Union[np.ndarray, None]) -> dict:
    """Min-max normalize the features and targets of the model.

    Test data is scaled with the training minimums and maximums so that
    the test set never leaks into the scaling parameters.

    :param x_train: Training features
    :type x_train: np.ndarray

    :param x_test: Test features
    :type x_test: np.ndarray

    :param y_train: Training targets
    :type y_train: np.ndarray

    :param y_test: Test targets; may be None when no test targets exist
                   (e.g. when predicting the future), in which case
                   ``y_test_norm`` is returned as None
    :type y_test: Union[np.ndarray, None]

    :return: Dictionary with the training minimums and maximums
             (``min_x_train``, ``max_x_train``, ``min_y_train``, ``max_y_train``)
             and the scaled arrays (``x_train_norm``, ``y_train_norm``,
             ``x_test_norm``, ``y_test_norm``)
    :rtype: dict
    """
    # get the min and max of each variable in each training array
    min_x_train = np.min(x_train, axis=0)
    max_x_train = np.max(x_train, axis=0)
    min_y_train = np.min(y_train, axis=0)
    max_y_train = np.max(y_train, axis=0)

    # normalize using the training min/max only
    x_train_norm = np.divide((x_train - min_x_train), (max_x_train - min_x_train))
    x_test_norm = np.divide((x_test - min_x_train), (max_x_train - min_x_train))
    y_train_norm = np.divide((y_train - min_y_train), (max_y_train - min_y_train))

    # targets for the test period may not exist
    if y_test is not None:
        y_test_norm = np.divide((y_test - min_y_train), (max_y_train - min_y_train))
    else:
        y_test_norm = None

    dict_out = {
        "min_x_train": min_x_train,
        "max_x_train": max_x_train,
        "min_y_train": min_y_train,
        "max_y_train": max_y_train,
        "x_train_norm": x_train_norm,
        "y_train_norm": y_train_norm,
        "x_test_norm": x_test_norm,
        "y_test_norm": y_test_norm
    }

    return dict_out
def denormalize_features(region: str,
                         normalized_dict: dict,
                         y_predicted_normalized: np.ndarray,
                         y_comparison: np.ndarray,
                         datetime_arr: np.ndarray) -> pd.DataFrame:
    """Denormalize model predictions back to the original target units.

    Inverts the min-max scaling using the training minimum and maximum
    stored in ``normalized_dict`` and packages the result alongside the
    comparison data and timestamps.

    :param region: Indicating region / balancing authority we want to train and test on.
                   Must match with string in CSV files.
    :type region: str

    :param normalized_dict: Dictionary output from normalization function.
    :type normalized_dict: dict

    :param y_predicted_normalized: Normalized predictions over the test set.
    :type y_predicted_normalized: np.ndarray

    :param y_comparison: Testing data to compare predictions to.
    :type y_comparison: np.ndarray

    :param datetime_arr: Array of datetimes corresponding to the predictions.
    :type datetime_arr: np.ndarray

    :return: Data frame with datetime, denormalized predictions, ground truth, and region.
    :rtype: pd.DataFrame
    """
    # invert the min-max scaling applied during normalization
    y_span = normalized_dict["max_y_train"] - normalized_dict["min_y_train"]
    predictions = y_predicted_normalized * y_span + normalized_dict["min_y_train"]

    # assemble the output frame with timestamps and observed values
    output_df = pd.DataFrame({"datetime": datetime_arr,
                              "predictions": predictions,
                              "ground_truth": np.squeeze(y_comparison)})

    # tag each row with the balancing authority region
    output_df["region"] = region

    return output_df
def pickle_model(region: str,
                 model_object: object,
                 model_name: str,
                 model_output_directory: Union[str, None]):
    """Pickle a trained model to file using joblib.

    The running scikit-learn version is embedded in the file name because a
    compatible version is required to reload the model safely.

    :param region: Indicating region / balancing authority we want to train and test on.
                   Must match with string in CSV files.
    :type region: str

    :param model_object: scikit-learn model object.
    :type model_object: object

    :param model_name: Name of sklearn model.
    :type model_name: str

    :param model_output_directory: Full path to output directory where model file will be written.
        NOTE(review): the annotation permits None, but os.path.join would fail
        on it — confirm callers always pass a real directory here.
    :type model_output_directory: str
    """
    # file name carries region, model name, and the sklearn version in use
    model_basename = f"{region}_{model_name}_scikit-learn-version-{sklearn.__version__}.joblib"
    target_path = os.path.join(model_output_directory, model_basename)

    # serialize the model object to disk
    joblib.dump(model_object, target_path)
def load_model(model_file: str) -> object:
    """Load a joblib-pickled model from file.

    The scikit-learn version is embedded in the file name because a compatible
    version is required to reload the model safely; a mismatch with the runtime
    version aborts the load.

    :param model_file: Full path with filename an extension to the joblib pickled model file.
    :type model_file: str

    :return: Model as an object.

    :raises AssertionError: If the scikit-learn version recorded in the file
        name differs from the version installed at runtime.
    """
    # version recorded in the file name, e.g. "..._scikit-learn-version-1.0.2.joblib";
    # the version string is the final hyphen-delimited token of the stem
    saved_version = os.path.splitext(model_file)[0].split('-')[-1]

    # version of scikit-learn available at runtime
    runtime_version = sklearn.__version__

    # refuse to unpickle across scikit-learn versions
    if saved_version != runtime_version:
        raise AssertionError(
            f"Incompatible scikit-learn version for saved model ({saved_version}) and current version ({runtime_version})."
        )

    return joblib.load(model_file)
def load_predictive_models(region: str,
                           model_output_directory: Union[str, None]):
    """Load the predictive model and the normalization dictionary either from the
    package data or from a user provided directory. The scikit-learn version being
    used must match the one the model was generated with.

    :param region: Indicating region / balancing authority we want to train and test on.
                   Must match with string in CSV files.
    :type region: str

    :param model_output_directory: Full path to output directory where model file
        was written; pass None or an empty string to load from the package data.
    :type model_output_directory: Union[str, None]

    :return: [0] MLP model
             [1] normalization dictionary
    """
    # current scikit-learn version; must match the version encoded in the file name
    sk_version = sklearn.__version__

    mlp_model_id = "multi-layer-perceptron-regressor"
    mlp_basename = f"{region}_{mlp_model_id}_scikit-learn-version-{sk_version}.joblib"
    norm_basename = f"{region}_normalization_dict.joblib"

    # load from the package data if no alternate directory is passed;
    # "not model_output_directory" covers both None and the empty string
    if not model_output_directory:
        mlp_model_path = pkg_resources.resource_filename(
            "tell", os.path.join("data", "models", mlp_basename))

        # the normalization dictionary lives alongside the packaged model
        normalized_dict_file = pkg_resources.resource_filename(
            "tell", os.path.join("data", "models", norm_basename))

    else:
        # get provided model and normalization files from the user directory
        mlp_model_path = os.path.join(model_output_directory, mlp_basename)
        normalized_dict_file = os.path.join(model_output_directory, norm_basename)

    # load the mlp model; raises if the sklearn versions are incompatible
    mlp_model = load_model(model_file=mlp_model_path)

    # load the normalization dictionary
    normalization_dict = load_normalization_dict(normalized_dict_file)

    return mlp_model, normalization_dict
def pickle_normalization_dict(region: str,
                              normalization_dict: dict,
                              model_output_directory: Union[str, None]):
    """Pickle the normalization dictionary to file using joblib.

    :param region: Indicating region / balancing authority we want to train and test on.
                   Must match with string in CSV files.
    :type region: str

    :param normalization_dict: Dictionary of normalization data
    :type normalization_dict: dict

    :param model_output_directory: Full path to output directory where the
        dictionary file will be written.
    :type model_output_directory: str
    """
    # build output file name
    basename = f"{region}_normalization_dict.joblib"
    output_file = os.path.join(model_output_directory, basename)

    # dump dictionary to a compressed file
    joblib.dump(value=normalization_dict, filename=output_file, compress=5)
def load_normalization_dict(file: str) -> dict:
    """Read a joblib-pickled normalization dictionary from disk.

    :param file: Full path with file name and extension to the pickled
        normalization dictionary
    :type file: str

    :return: Normalization dictionary
    """
    normalization_dict = joblib.load(file)

    return normalization_dict
def evaluate(region: str,
             y_predicted: np.ndarray,
             y_comparison: np.ndarray) -> pd.DataFrame:
    """Evaluation of model performance using the predicted compared to the test data.

    NaN entries in the comparison data are excluded (with the matching
    predictions) before computing the statistics.

    :param region: Indicating region / balancing authority we want to train and test on.
                   Must match with string in CSV files.
    :type region: str

    :param y_predicted: Predicted Y result array.
    :type y_predicted: np.ndarray

    :param y_comparison: Comparison test data for Y array.
    :type y_comparison: np.ndarray

    :return: Data frame of stats with columns BA, RMS_ABS, RMS_NORM, MAPE, R2.
    :rtype: pd.DataFrame
    """
    # remove all the no data values in the comparison test data
    y_comparison = y_comparison.squeeze()
    y_comp_clean_idx = np.where(~np.isnan(y_comparison))
    y_comp = y_comparison[y_comp_clean_idx].squeeze()

    # get matching predicted data
    y_pred = y_predicted[y_comp_clean_idx]

    # NOTE: sklearn metrics take (y_true, y_pred); the observed data is the
    # ground truth here. The previous argument order was reversed, which
    # altered the asymmetric metrics (MAPE divided by predictions, R2 scored
    # predictions as truth). MSE is symmetric, so RMSE is unaffected.

    # absolute RMSE
    rms_abs = np.sqrt(mean_squared_error(y_comp, y_pred))

    # RMSE normalized by the mean observed value
    rms_norm = rms_abs / np.mean(y_comp)

    # mean absolute percentage error
    mape = mean_absolute_percentage_error(y_comp, y_pred)

    # R2
    r2_val = r2_score(y_comp, y_pred)

    stats_dict = {"BA": [region],
                  "RMS_ABS": [rms_abs],
                  "RMS_NORM": [rms_norm],
                  "MAPE": [mape],
                  "R2": [r2_val]}

    return pd.DataFrame(stats_dict)