Source code for tell.data_process_eia_930

import os

import pandas as pd

from joblib import Parallel, delayed
from .package_data import get_ba_abbreviations


[docs]def list_EIA_930_files(data_input_dir: str) -> list:
    """Make a list of all the file names for the EIA-930 hourly load dataset

    :param data_input_dir:         Top-level data directory for TELL
    :type data_input_dir:          str

    :return:                       list

    """

    # Get a list of BA abbreviations to process:
    ba_name = get_ba_abbreviations()

    # Initiate an empty list:
    path_list = []

    # Loop over the list and find the path for each BA in the list:
    for i in ba_name:
        path_to_check = os.path.join(data_input_dir, r'tell_raw_data', r'EIA_930', r'Balancing_Authority', f'{i}.xlsx')
        path_list.append(path_to_check)

    # Return the list:
    return path_list


[docs]def eia_data_subset(file_string: str, data_input_dir: str):
    """Extract only the columns TELL needs from the EIA-930 Excel files

    :param file_string:            File name of EIA-930 hourly load data by BA
    :type file_string:             str

    :param data_input_dir:         Top-level data directory for TELL
    :type data_input_dir:          str

    """

    # Set the output directory based on the "data_input_dir" variable:
    output_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'historical_ba_load')

    # If the output directory doesn't exist then create it:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read in the data from the "Published Hourly Data" sheet:
    df = pd.read_excel(file_string, sheet_name='Published Hourly Data')

    # Use datetime string to get the year, month, day, and hour:
    df['Year'] = df['UTC time'].dt.strftime('%Y')
    df['Month'] = df['UTC time'].dt.strftime('%m')
    df['Day'] = df['UTC time'].dt.strftime('%d')
    df['Hour'] = df['UTC time'].dt.strftime('%H')

    # Only keep the columns that are needed:
    col_names = ['Year', 'Month', 'Day', 'Hour', 'DF', 'Adjusted D', 'Adjusted NG', 'Adjusted TI']
    df = df[col_names].copy()

    # Rename the columns to add the units to each variable:
    df.rename(columns={"DF": "Forecast_Demand_MWh",
                       "Adjusted D": "Adjusted_Demand_MWh",
                       "Adjusted NG": "Adjusted_Generation_MWh",
                       "Adjusted TI": "Adjusted_Interchange_MWh"}, inplace=True)

    # Extract the BA name from the "file_string" variable:
    BA_name = os.path.splitext(os.path.basename(file_string))[0]

    # Write the output to a .csv file:
    df.to_csv(os.path.join(output_dir, f'{BA_name}_hourly_load_data.csv'), index=False, header=True)


[docs]def process_eia_930_data(data_input_dir: str, n_jobs: int):
    """Read in list of EIA 930 files, subset the data, and save the output as a .csv file

    :param data_input_dir:         Top-level data directory for TELL
    :type data_input_dir:          str

    :param n_jobs:                 The maximum number of concurrently running jobs, such as the number of Python
                                   worker processes when backend=”multiprocessing” or the size of the thread-pool
                                   when backend=”threading”. If -1 all CPUs are used. If 1 is given, no parallel
                                   computing code is used at all, which is useful for debugging. For n_jobs
                                   below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs
                                   but one are used. None is a marker for ‘unset’ that will be interpreted as
                                   n_jobs=1 (sequential execution) unless the call is performed under a
                                   parallel_backend context manager that sets another value for n_jobs.
    :type n_jobs:                  int

    """

    # Create the list of EIA-930 Excel files:
    list_of_files = list_EIA_930_files(data_input_dir)

    # Process each file in the list in parallel:
    Parallel(n_jobs=n_jobs)(
        delayed(eia_data_subset)(
            file_string=i,
            data_input_dir=data_input_dir
        ) for i in list_of_files
    )