# Source code for tell.data_process_eia_930

import os

import pandas as pd

from joblib import Parallel, delayed
from .package_data import get_ba_abbreviations


def list_EIA_930_files(data_input_dir: str) -> list:
    """Build the list of EIA-930 hourly load Excel file paths, one per balancing authority.

    :param data_input_dir: Top-level data directory for TELL
    :type data_input_dir: str

    :return: list of paths to the per-BA EIA-930 ``.xlsx`` files
    """
    # Directory that holds the per-BA EIA-930 Excel workbooks:
    ba_dir = os.path.join(data_input_dir, r'tell_raw_data', r'EIA_930', r'Balancing_Authority')

    # One "<BA>.xlsx" path per balancing-authority abbreviation:
    return [os.path.join(ba_dir, f'{ba}.xlsx') for ba in get_ba_abbreviations()]
def eia_data_subset(file_string: str, data_input_dir: str):
    """Extract only the columns TELL needs from an EIA-930 Excel file and save them as a .csv file.

    :param file_string: File name of EIA-930 hourly load data by BA
    :type file_string: str

    :param data_input_dir: Top-level data directory for TELL
    :type data_input_dir: str
    """
    # Set the output directory based on the "data_input_dir" variable:
    output_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'historical_ba_load')

    # Create the output directory if needed. "exist_ok=True" avoids the
    # check-then-create race that can raise FileExistsError when several
    # workers run this function concurrently (it is dispatched in parallel
    # by joblib in process_eia_930_data):
    os.makedirs(output_dir, exist_ok=True)

    # Read in the data from the "Published Hourly Data" sheet:
    df = pd.read_excel(file_string, sheet_name='Published Hourly Data')

    # Use the "UTC time" datetime column to derive zero-padded year, month, day, and hour strings:
    df['Year'] = df['UTC time'].dt.strftime('%Y')
    df['Month'] = df['UTC time'].dt.strftime('%m')
    df['Day'] = df['UTC time'].dt.strftime('%d')
    df['Hour'] = df['UTC time'].dt.strftime('%H')

    # Only keep the columns that are needed:
    col_names = ['Year', 'Month', 'Day', 'Hour', 'DF', 'Adjusted D', 'Adjusted NG', 'Adjusted TI']
    df = df[col_names].copy()

    # Rename the columns to add the units to each variable:
    df.rename(columns={"DF": "Forecast_Demand_MWh",
                       "Adjusted D": "Adjusted_Demand_MWh",
                       "Adjusted NG": "Adjusted_Generation_MWh",
                       "Adjusted TI": "Adjusted_Interchange_MWh"}, inplace=True)

    # Extract the BA name (file name without extension) from the "file_string" variable:
    BA_name = os.path.splitext(os.path.basename(file_string))[0]

    # Write the output to a .csv file:
    df.to_csv(os.path.join(output_dir, f'{BA_name}_hourly_load_data.csv'), index=False, header=True)
def process_eia_930_data(data_input_dir: str, n_jobs: int):
    """Subset every EIA-930 hourly load file and save each result as a .csv file.

    :param data_input_dir: Top-level data directory for TELL
    :type data_input_dir: str

    :param n_jobs: The maximum number of concurrently running jobs, such as the
                   number of Python worker processes when backend="multiprocessing"
                   or the size of the thread-pool when backend="threading". If -1
                   all CPUs are used. If 1 is given, no parallel computing code is
                   used at all, which is useful for debugging. For n_jobs below -1,
                   (n_cpus + 1 + n_jobs) are used; thus for n_jobs = -2, all CPUs
                   but one are used. None is a marker for 'unset' that will be
                   interpreted as n_jobs=1 (sequential execution) unless the call
                   is performed under a parallel_backend context manager that sets
                   another value for n_jobs.
    :type n_jobs: int
    """
    # One deferred subsetting task per EIA-930 Excel file:
    tasks = (delayed(eia_data_subset)(file_string=file, data_input_dir=data_input_dir)
             for file in list_EIA_930_files(data_input_dir))

    # Dispatch the tasks across the requested number of joblib workers:
    Parallel(n_jobs=n_jobs)(tasks)