Source code for tell.mlp_utils

import os
import pkg_resources
from typing import Union

import joblib
import numpy as np
import pandas as pd
import sklearn
import yaml
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error


def get_balancing_authority_to_model_dict():
    """Return a dictionary of balancing authority abbreviations to model."""

    ba_file = pkg_resources.resource_filename("tell", "data/balancing_authority_modeled.yml")

    # read into a dictionary
    with open(ba_file, 'r') as yml:
        return yaml.load(yml, Loader=yaml.FullLoader)

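# Illustrative sketch (not part of the original module): how the mapping of modeled
# balancing authorities might be inspected. That the YAML loads into a dictionary keyed
# by BA abbreviation is an assumption based on the function name above.
def _example_list_modeled_bas():

    # load the dictionary of modeled balancing authorities
    ba_dict = get_balancing_authority_to_model_dict()

    # return the sorted abbreviations for a quick look at what is available
    return sorted(ba_dict.keys())
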
def normalize_prediction_data(data_arr: np.ndarray,
                              min_train_arr: np.ndarray,
                              max_train_arr: np.ndarray) -> np.ndarray:
    """Normalize target data using the existing min and max of the training data.

    :param data_arr: Array of target data
    :type data_arr: np.ndarray

    :param min_train_arr: Array of minimum values from the training target data
    :type min_train_arr: np.ndarray

    :param max_train_arr: Array of maximum values from the training target data
    :type max_train_arr: np.ndarray

    :return: Array of normalized target data
    """

    return np.divide((data_arr - min_train_arr), (max_train_arr - min_train_arr))

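# Illustrative sketch (not part of the original module): min-max scaling of new target
# data with the minima and maxima saved from training, so predictions are transformed
# consistently with the data the model was fit on. The toy arrays are assumptions.
def _example_normalize_prediction_data():

    # hypothetical training extrema and new target data
    min_train_arr = np.array([0.0])
    max_train_arr = np.array([100.0])
    data_arr = np.array([25.0, 50.0, 75.0])

    # values fall within [0, 1] when the new data lies inside the training range
    return normalize_prediction_data(data_arr, min_train_arr, max_train_arr)
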
def normalize_features(x_train: np.ndarray,
                       x_test: np.ndarray,
                       y_train: np.ndarray,
                       y_test: np.ndarray) -> dict:
    """Normalize the features and targets of the model.

    :param x_train: Training features
    :type x_train: np.ndarray

    :param x_test: Test features
    :type x_test: np.ndarray

    :param y_train: Training targets
    :type y_train: np.ndarray

    :param y_test: Test targets
    :type y_test: np.ndarray

    :return: Dictionary of scaled features and targets along with the training minima and maxima
    """

    # get the min and max of each variable in the training arrays
    min_x_train = np.min(x_train, axis=0)
    max_x_train = np.max(x_train, axis=0)
    min_y_train = np.min(y_train, axis=0)
    max_y_train = np.max(y_train, axis=0)

    # normalize using the training min and max
    x_train_norm = np.divide((x_train - min_x_train), (max_x_train - min_x_train))
    x_test_norm = np.divide((x_test - min_x_train), (max_x_train - min_x_train))
    y_train_norm = np.divide((y_train - min_y_train), (max_y_train - min_y_train))

    if y_test is not None:
        y_test_norm = np.divide((y_test - min_y_train), (max_y_train - min_y_train))
    else:
        y_test_norm = None

    dict_out = {
        "min_x_train": min_x_train,
        "max_x_train": max_x_train,
        "min_y_train": min_y_train,
        "max_y_train": max_y_train,
        "x_train_norm": x_train_norm,
        "y_train_norm": y_train_norm,
        "x_test_norm": x_test_norm,
        "y_test_norm": y_test_norm
    }

    return dict_out

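# Illustrative sketch (not part of the original module): normalizing random training and
# test splits and pulling a few entries out of the returned dictionary. The array shapes
# and the random data are arbitrary assumptions.
def _example_normalize_features():

    rng = np.random.default_rng(42)

    # hypothetical feature and target arrays
    x_train = rng.random((100, 5))
    x_test = rng.random((20, 5))
    y_train = rng.random(100)
    y_test = rng.random(20)

    norm = normalize_features(x_train, x_test, y_train, y_test)

    # the dictionary carries both the scaled arrays and the training extrema
    # needed later for denormalization
    return norm["x_train_norm"], norm["min_y_train"], norm["max_y_train"]
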
def denormalize_features(region: str,
                         normalized_dict: dict,
                         y_predicted_normalized: np.ndarray,
                         y_comparison: np.ndarray,
                         datetime_arr: np.ndarray) -> pd.DataFrame:
    """Denormalize the predictions of the model.

    :param region: Region / balancing authority to train and test on. Must match the string in the CSV files.
    :type region: str

    :param normalized_dict: Dictionary output from the normalization function.
    :type normalized_dict: dict

    :param y_predicted_normalized: Normalized predictions over the test set.
    :type y_predicted_normalized: np.ndarray

    :param y_comparison: Test data to compare predictions against.
    :type y_comparison: np.ndarray

    :param datetime_arr: Array of datetimes corresponding to the predictions.
    :type datetime_arr: np.ndarray

    :return: Data frame of denormalized predictions
    """

    # denormalize the predicted Y values using the stored training min and max
    y_p = y_predicted_normalized * (normalized_dict["max_y_train"] - normalized_dict["min_y_train"]) + normalized_dict["min_y_train"]

    # create a data frame with the datetime attached
    df = pd.DataFrame({"datetime": datetime_arr,
                       "predictions": y_p,
                       "ground_truth": np.squeeze(y_comparison)})

    # add in the region field
    df["region"] = region

    return df

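# Illustrative sketch (not part of the original module): undoing the min-max scaling on
# normalized predictions and packaging them with timestamps. The region code "PJM" and
# the toy values are assumptions.
def _example_denormalize_features():

    # hypothetical normalization dictionary and normalized predictions
    normalized_dict = {"min_y_train": 0.0, "max_y_train": 100.0}
    y_predicted_normalized = np.array([0.25, 0.5])
    y_comparison = np.array([26.0, 49.0])
    datetime_arr = pd.to_datetime(["2020-01-01 00:00", "2020-01-01 01:00"])

    # predictions come back on the original scale (here 25.0 and 50.0)
    return denormalize_features(region="PJM",
                                normalized_dict=normalized_dict,
                                y_predicted_normalized=y_predicted_normalized,
                                y_comparison=y_comparison,
                                datetime_arr=datetime_arr)
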
def pickle_model(region: str,
                 model_object: object,
                 model_name: str,
                 model_output_directory: Union[str, None]):
    """Pickle model to file using joblib. The scikit-learn version is included in the
    file name because a compatible version is required to reload the model safely.

    :param region: Region / balancing authority to train and test on. Must match the string in the CSV files.
    :type region: str

    :param model_object: scikit-learn model object.
    :type model_object: object

    :param model_name: Name of the scikit-learn model.
    :type model_name: str

    :param model_output_directory: Full path to the output directory where the model file will be written.
    :type model_output_directory: str
    """

    # build the output file name
    basename = f"{region}_{model_name}_scikit-learn-version-{sklearn.__version__}.joblib"
    output_file = os.path.join(model_output_directory, basename)

    # dump the model to file
    joblib.dump(model_object, output_file)

def load_model(model_file: str) -> object:
    """Load a pickled model from file using joblib. The scikit-learn version is included
    in the file name because a compatible version is required to reload the model safely.

    :param model_file: Full path with file name and extension to the joblib pickled model file.
    :type model_file: str

    :return: Model as an object.
    """

    # get the scikit-learn version the model was saved with from the file name
    sk_model_version = os.path.splitext(model_file)[0].split('-')[-1]

    # get the version of scikit-learn being used during runtime
    sk_run_version = sklearn.__version__

    # compare the two versions to ensure compatibility
    if sk_model_version != sk_run_version:
        msg = f"Incompatible scikit-learn version for saved model ({sk_model_version}) and current version ({sk_run_version})."
        raise AssertionError(msg)

    # load the model from file
    return joblib.load(model_file)

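# Illustrative sketch (not part of the original module): a save/load round trip. Because
# the scikit-learn version is embedded in the file name, load_model() only succeeds when
# the runtime version matches the one the model was written with. The region code "PJM",
# the toy training data, and the temporary directory usage are assumptions.
def _example_model_round_trip():

    import tempfile
    from sklearn.neural_network import MLPRegressor

    rng = np.random.default_rng(0)
    x, y = rng.random((50, 3)), rng.random(50)

    # fit a small model to have something to persist
    model = MLPRegressor(hidden_layer_sizes=(8,), max_iter=200).fit(x, y)

    with tempfile.TemporaryDirectory() as tmp_dir:

        # write the model, then reload it through the version check
        pickle_model(region="PJM",
                     model_object=model,
                     model_name="multi-layer-perceptron-regressor",
                     model_output_directory=tmp_dir)

        model_file = os.path.join(tmp_dir,
                                  f"PJM_multi-layer-perceptron-regressor_scikit-learn-version-{sklearn.__version__}.joblib")

        return load_model(model_file=model_file)
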
def load_predictive_models(region: str,
                           model_output_directory: Union[str, None]):
    """Load the predictive model and the normalization dictionary either from the package
    data or from a user-provided directory. The scikit-learn version being used must match
    the one the model was generated with.

    :param region: Region / balancing authority to train and test on. Must match the string in the CSV files.
    :type region: str

    :param model_output_directory: Full path to the directory where the model file is stored.
    :type model_output_directory: Union[str, None]

    :return: [0] MLP model
             [1] normalization dictionary
    """

    # current scikit-learn version
    sk_version = sklearn.__version__

    # load the models from the package data if no alternate directory is passed
    if len(model_output_directory) == 0:

        # get the default model file
        mlp_model_id = "multi-layer-perceptron-regressor"
        mlp_model_file = os.path.join("data", "models", f"{region}_{mlp_model_id}_scikit-learn-version-{sk_version}.joblib")
        mlp_model_path = pkg_resources.resource_filename("tell", mlp_model_file)

    else:

        # get the user-provided model file
        mlp_model_id = "multi-layer-perceptron-regressor"
        mlp_model_file = f"{region}_{mlp_model_id}_scikit-learn-version-{sk_version}.joblib"
        mlp_model_path = os.path.join(model_output_directory, mlp_model_file)

    # load the mlp model
    mlp_model = load_model(model_file=mlp_model_path)

    # load the normalization dictionary
    normalized_dict_file = os.path.join(model_output_directory, f"{region}_normalization_dict.joblib")
    normalization_dict = load_normalization_dict(normalized_dict_file)

    return mlp_model, normalization_dict

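# Illustrative sketch (not part of the original module): loading a trained model and its
# normalization dictionary from a user-supplied directory. The region code "PJM" and the
# directory path are assumptions; both files must have been written previously by
# pickle_model() and pickle_normalization_dict() under the current scikit-learn version.
def _example_load_predictive_models(model_dir="/path/to/saved/models"):

    mlp_model, normalization_dict = load_predictive_models(region="PJM",
                                                           model_output_directory=model_dir)

    return mlp_model, normalization_dict
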
def pickle_normalization_dict(region: str,
                              normalization_dict: dict,
                              model_output_directory: Union[str, None]):
    """Pickle the normalization dictionary to file using joblib.

    :param region: Region / balancing authority to train and test on. Must match the string in the CSV files.
    :type region: str

    :param normalization_dict: Dictionary of normalization data
    :type normalization_dict: dict

    :param model_output_directory: Full path to the output directory where the dictionary file will be written.
    :type model_output_directory: str
    """

    # build the output file name
    basename = f"{region}_normalization_dict.joblib"
    output_file = os.path.join(model_output_directory, basename)

    # dump the dictionary to file
    joblib.dump(value=normalization_dict, filename=output_file, compress=5)

def load_normalization_dict(file: str) -> dict:
    """Load a pickled normalization dictionary from file using joblib.

    :param file: Full path with file name and extension to the pickled normalization dictionary
    :type file: str

    :return: Normalization dictionary
    """

    return joblib.load(file)

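# Illustrative sketch (not part of the original module): persisting a normalization
# dictionary and reading it back from the file name generated above. The region code
# "PJM", the toy dictionary values, and the temporary directory are assumptions.
def _example_normalization_dict_round_trip():

    import tempfile

    normalization_dict = {"min_y_train": 0.0, "max_y_train": 100.0}

    with tempfile.TemporaryDirectory() as tmp_dir:

        # write the dictionary, then reload it
        pickle_normalization_dict(region="PJM",
                                  normalization_dict=normalization_dict,
                                  model_output_directory=tmp_dir)

        return load_normalization_dict(os.path.join(tmp_dir, "PJM_normalization_dict.joblib"))
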
def evaluate(region: str,
             y_predicted: np.ndarray,
             y_comparison: np.ndarray) -> pd.DataFrame:
    """Evaluate model performance by comparing the predictions to the test data.

    :param region: Region / balancing authority to train and test on. Must match the string in the CSV files.
    :type region: str

    :param y_predicted: Predicted Y result array.
    :type y_predicted: np.ndarray

    :param y_comparison: Comparison test data for the Y array.
    :type y_comparison: np.ndarray

    :return: Data frame of evaluation statistics.
    """

    # remove all of the no-data values from the comparison test data
    y_comparison = y_comparison.squeeze()
    y_comp_clean_idx = np.where(~np.isnan(y_comparison))
    y_comp = y_comparison[y_comp_clean_idx].squeeze()

    # get the matching predicted data
    y_pred = y_predicted[y_comp_clean_idx]

    # absolute root mean squared error
    rms_abs = np.sqrt(mean_squared_error(y_pred, y_comp))

    # RMSE normalized by the mean of the comparison data
    rms_norm = rms_abs / np.mean(y_comp)

    # mean absolute percentage error
    mape = mean_absolute_percentage_error(y_pred, y_comp)

    # R2
    r2_val = r2_score(y_pred, y_comp)

    stats_dict = {"BA": [region],
                  "RMS_ABS": [rms_abs],
                  "RMS_NORM": [rms_norm],
                  "MAPE": [mape],
                  "R2": [r2_val]}

    return pd.DataFrame(stats_dict)
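
# Illustrative sketch (not part of the original module): scoring predictions against
# observations that contain a missing value, which evaluate() drops before computing the
# statistics. The region code "PJM" and the toy arrays are assumptions.
def _example_evaluate():

    y_predicted = np.array([10.0, 20.0, 30.0, 40.0])
    y_comparison = np.array([11.0, np.nan, 29.0, 41.0])

    # returns one row with the BA name, absolute and normalized RMSE, MAPE, and R2
    return evaluate(region="PJM", y_predicted=y_predicted, y_comparison=y_comparison)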