Source code for tell.mlp_predict

import os

import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from typing import Union
from .mlp_prepare_data import DatasetPredict, DefaultSettings
from .mlp_utils import normalize_prediction_data, load_predictive_models



[docs]
def predict(region: str,
            year: int,
            data_dir: str,
            datetime_field_name: str = "Time_UTC",
            save_prediction: bool = False,
            prediction_output_directory: Union[str, None] = None,
            **kwargs):
    """Generate predictions for MLP model for a target region from an input CSV file.

    :param region:                      Indicating region / balancing authority we want to train and test on.
                                        Must match with string in CSV files.
    :type region:                       str

    :param year:                        Target year to use in YYYY format.
    :type year:                         int

    :param data_dir:                    Full path to the directory that houses the input CSV files.
    :type data_dir:                     str

    :param save_prediction:             Choice to write predictions to a .csv file
    :type save_prediction:              bool

    :param prediction_output_directory: Full path to output directory where prediction files will be written.
    :type prediction_output_directory:  Union[str, None]

    :param datetime_field_name:         Name of the datetime field.
    :type datetime_field_name:          str

    :param data_column_rename_dict:     Dictionary for the field names present in the input CSV file (keys) to what the
                                        code expects them to be (values).
    :type data_column_rename_dict:      Optional[dict[str]]

    :param expected_datetime_columns:   Expected names of the date time columns in the input CSV file.
    :type expected_datetime_columns:    Optional[list[str]]

    :param hour_field_name:             Field name of the hour field in the input CSV file.
    :type hour_field_name:              Optional[str]

    :param month_field_name:            Field name of the month field in the input CSV file.
    :type month_field_name:             Optional[str]

    :param x_variables:                 Target variable list.
    :type x_variables:                  Optional[list[str]]

    :param add_dayofweek_xvars:         True if the user wishes to add weekday and holiday targets to the x variables.
    :type add_dayofweek_xvars:          Optional[bool]

    :param y_variables:                 Feature variable list.
    :type y_variables:                  Optional[list[str]]

    :param day_list:                    List of day abbreviations and their order.
    :type day_list:                     Optional[list[str]]

    :param seed_value:                  Seed value to reproduce randomization.
    :type seed_value:                   Optional[int]

    :param verbose:                     Choice to see logged outputs.
    :type verbose:                      bool

    :return:                            Prediction data frame

    """

    # get project level settings data
    settings = DefaultSettings(region=region,
                               data_dir=data_dir,
                               **kwargs)

    # set random seed
    np.random.seed(settings.seed_value)

    # prepare data for MLP model
    data_mlp = DatasetPredict(region=region,
                              year=year,
                              data_dir=data_dir,
                              datetime_field_name=datetime_field_name,
                              **kwargs)

    # load models and the normalization dictionary from file
    mlp_model, normalized_dict = load_predictive_models(region=region,
                                                        model_output_directory=settings.model_output_directory)

    # normalize model features and targets for the MLP model
    x_mlp_norm = normalize_prediction_data(data_arr=data_mlp.x_data,
                                           min_train_arr=normalized_dict["min_x_train"],
                                           max_train_arr=normalized_dict["max_x_train"])

    # run the MLP model with normalized data
    y_predicted_norm = mlp_model.predict(x_mlp_norm)

    # denormalize predicted data
    y_predicted = (y_predicted_norm * (normalized_dict["max_y_train"] - normalized_dict["min_y_train"]) + normalized_dict["min_y_train"]).round(2)

    # generate output data frame
    prediction_df = pd.DataFrame({"Time_UTC": data_mlp.df_data[settings.DATETIME_FIELD].values,
                                  "Load": y_predicted,
                                  "BA": region})

    # save the prediction to a .csv file:
    if save_prediction:

        # if the subdirectory for the year being processed doesn't exist then create it:
        if not os.path.exists(os.path.join(prediction_output_directory, str(year))):
            os.makedirs(os.path.join(prediction_output_directory, str(year)))

        prediction_df.to_csv(os.path.join(prediction_output_directory, str(year), f'{region}_'f'{year}_mlp_output.csv'), index=False)

    return prediction_df




[docs]
def predict_batch(target_region_list: list,
                  year: int,
                  data_dir: str,
                  n_jobs: int = -1,
                  datetime_field_name: str = "Time_UTC",
                  save_prediction: bool = False,
                  prediction_output_directory: Union[str, None] = None,
                  **kwargs):
    """Generate predictions for MLP model for a target region from an input CSV file for all regions
    in input list in parallel.

    :param target_region_list:          List of names indicating region / balancing authority we want to train and test
                                        on. Must match with string in CSV files.
    :type target_region_list:           list

    :param year:                        Target year to use in YYYY format.
    :type year:                         int

    :param data_dir:                    Full path to the directory that houses the input CSV files.
    :type data_dir:                     str

    :param n_jobs:                      The maximum number of concurrently running jobs, such as the number of Python
                                        worker processes when backend=”multiprocessing” or the size of the thread-pool
                                        when backend=”threading”. If -1 all CPUs are used. If 1 is given, no parallel
                                        computing code is used at all, which is useful for debugging. For n_jobs
                                        below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs
                                        but one are used. None is a marker for ‘unset’ that will be interpreted as
                                        n_jobs=1 (sequential execution) unless the call is performed under a
                                        parallel_backend context manager that sets another value for n_jobs.
    :type n_jobs:                       int

    :param datetime_field_name:         Name of the datetime field.
    :type datetime_field_name:          str

    :param save_prediction:             Choice to write predictions to a .csv file
    :type save_prediction:              bool

    :param prediction_output_directory: Full path to output directory where prediction files will be written.
    :type prediction_output_directory:  Union[str, None]

    :param data_column_rename_dict:     Dictionary for the field names present in the input CSV file (keys) to what the
                                        code expects them to be (values).
    :type data_column_rename_dict:      Optional[dict[str]]

    :param expected_datetime_columns:   Expected names of the date time columns in the input CSV file.
    :type expected_datetime_columns:    Optional[list[str]]

    :param hour_field_name:             Field name of the hour field in the input CSV file.
    :type hour_field_name:              Optional[str]

    :param month_field_name:            Field name of the month field in the input CSV file.
    :type month_field_name:             Optional[str]

    :param x_variables:                 Target variable list.
    :type x_variables:                  Optional[list[str]]

    :param add_dayofweek_xvars:         True if the user wishes to add weekday and holiday targets to the x variables.
    :type add_dayofweek_xvars:          Optional[bool]

    :param y_variables:                 Feature variable list.
    :type y_variables:                  Optional[list[str]]

    :param day_list:                    List of day abbreviations and their order.
    :type day_list:                     Optional[list[str]]

    :param seed_value:                  Seed value to reproduce randomization.
    :type seed_value:                   Optional[int]

    :param verbose:                     Choice to see logged outputs.
    :type verbose:                      bool

    :return:                            Prediction data frame

    """

    # run all regions in target list in parallel
    results = Parallel(n_jobs=n_jobs, backend="loky")(delayed(predict)(region=region,
                                                                       year=year,
                                                                       data_dir=data_dir,
                                                                       datetime_field_name=datetime_field_name,
                                                                       save_prediction=save_prediction,
                                                                       prediction_output_directory=prediction_output_directory,
                                                                       **kwargs) for region in target_region_list)

    # aggregate outputs
    for index, i in enumerate(results):

        if index == 0:
            prediction_df = i
        else:
            prediction_df = pd.concat([prediction_df, i])

    return prediction_df