# Source code for tell.data_process_compile

import os

import pandas as pd
import numpy as np

from .package_data import get_ba_abbreviations



# [docs]
def compile_data(start_year: int, end_year: int, data_input_dir: str):
    """Merge the load, population, and climate data into a single .csv file for each BA

    For every BA abbreviation returned by ``get_ba_abbreviations()`` the hourly load file, the hourly
    population file, and one weather file per year in ``start_year``..``end_year`` (inclusive) are read,
    inner-joined on ``Year``/``Month``/``Day``/``Hour``, and written to
    ``<data_input_dir>/tell_quickstarter_data/outputs/compiled_historical_data/<BA>_historical_data.csv``.
    A BA is skipped (no output written) when any one of those input files is missing.

    :param start_year:                         Year to start process; four digit year (e.g., 1990)
    :type start_year:                          int

    :param end_year:                           Year to end process; four digit year (e.g., 1990)
    :type end_year:                            int

    :param data_input_dir:                     Top-level data directory for TELL
    :type data_input_dir:                      str

    :raises ValueError:                        If ``start_year`` is greater than ``end_year``

    """

    # An empty year range leaves nothing to merge; fail early with a clear message:
    if start_year > end_year:
        raise ValueError(f"start_year ({start_year}) must not be greater than end_year ({end_year})")

    years = range(start_year, end_year + 1)
    merge_keys = ['Year', 'Month', 'Day', 'Hour']

    # Get a list of BA abbreviations to process:
    ba_names = get_ba_abbreviations()

    # Set the input directories for each variable:
    load_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'historical_ba_load')
    population_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'historical_population')
    weather_dir = os.path.join(data_input_dir, r'sample_forcing_data', r'historical_weather')

    # Set the output directory based on the "data_input_dir" variable:
    output_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'compiled_historical_data')
    os.makedirs(output_dir, exist_ok=True)

    # Loop over the list of BAs to process:
    for ba in ba_names:
        load_file = os.path.join(load_dir, f"{ba}_hourly_load_data.csv")
        population_file = os.path.join(population_dir, f"{ba}_hourly_population_data.csv")
        weather_files = [os.path.join(weather_dir, f"{ba}_WRF_Hourly_Mean_Meteorology_{year}.csv")
                         for year in years]

        # Check that all of the requisite data exist for that BA. The weather check covers every
        # requested year (not one fixed year) so that it matches the files that are read below:
        required_files = [load_file, population_file, *weather_files]
        if not all(os.path.isfile(path) for path in required_files):
            continue

        # Read in the historical load and population data for that BA:
        load_df = pd.read_csv(load_file)
        population_df = pd.read_csv(population_file)

        # Read in each year of historical weather and concatenate the years into a single dataframe:
        weather_df = pd.concat([_read_weather_year(path) for path in weather_files])

        # Merge the historical load and population dataframes together by date:
        merged_df = pd.merge(load_df, population_df, how='inner', on=merge_keys)

        # Merge in the historical weather by date:
        merged_df = pd.merge(merged_df, weather_df, how='inner', on=merge_keys)

        # Round the population to 2 decimal places:
        merged_df['Total_Population'] = merged_df['Total_Population'].round(2)

        # Write the merged dataframe to a .csv file
        merged_df.to_csv(os.path.join(output_dir, f"{ba}_historical_data.csv"), index=False, header=True)


def _read_weather_year(weather_file: str) -> pd.DataFrame:
    """Read one annual weather .csv file and return only the date parts and weather variables

    :param weather_file:                       Full path to a single-year hourly weather .csv file
    :type weather_file:                        str

    :return:                                   DataFrame with the columns Year, Month, Day, Hour, T2, Q2,
                                               SWDOWN, GLW, and WSPD

    """

    weather = pd.read_csv(weather_file)

    # Convert the time stamp to a datetime variable and then extract the year, month, day, and hour variables:
    timestamps = pd.to_datetime(weather['Time_UTC'])
    weather['Year'] = timestamps.dt.year.astype(np.int64)
    weather['Month'] = timestamps.dt.month.astype(np.int64)
    weather['Day'] = timestamps.dt.day.astype(np.int64)
    weather['Hour'] = timestamps.dt.hour.astype(np.int64)

    # Only keep the columns that are needed:
    return weather[['Year', 'Month', 'Day', 'Hour', 'T2', 'Q2', 'SWDOWN', 'GLW', 'WSPD']].copy()