run-pipeline

#!/usr/bin/perl
#SBATCH --cpus-per-task=64
#SBATCH --nodes=1
#SBATCH -p main,viz
#SBATCH --account=snap
#SBATCH --export=ALL,SLURMBATCH=yes
use strict;
use warnings;

=head1 NAME

run-pipeline - Execute all or part of the data-processing pipeline

=head1 SYNOPSIS

    run-pipeline [-h|--help] [-n|--dry-run] [-p|--procs PROCS]
        [-d|--dir DATA_DIR] [-e|--python EXEC] [--script-dir DIR]
        [-g|--groups DATA_GROUPS] [-s|--steps STEPS] [-v|--variant VARIANT]
    run-pipeline ls

    Options:
        -h|--help:      Print help information and exit
        -n|--dry-run:   Show which steps of the pipeline will get run
                        but don't actually run them.
        -e|--python     Override the path to the python executable.
                        You'll want this if using SLURM so the code
                        still runs in your virtual environment.
        -p|--procs:     Maximum number of parallel processes to run. Not
                        all steps in the pipeline use this.
        -d|--dir:       Specify the directory where the data for the
                        pipeline lies. Defaults to:
                        '/workspace/Shared/Tech_Projects/DOT/project_data/wrf_pcpt/'
        --script-dir:   Specify the directory where the python scripts
                        for the pipeline steps lie. Defaults to './pipeline'
                        relative to the directory that this script is in.
                        The SLURM sbatch command makes a copy of the script
                        and puts it somewhere else, so you'll need to use this
                        if running through SLURM.
        -g|--groups:    Specify which data groups to run.
                        Defaults to: 'all'
        -s|--steps:     Specify which steps to run.
                        Defaults to: 'all'
        -v|--variant:   A string to append to the end of each directory in the
                        pipeline, useful for creating a parallel "branch" in
                        the pipeline. If an input directory with the variant
                        name isn't found, the default name will be used.

DATA_GROUPS is a comma-separated list of the data groups/models to execute
the pipeline for. The elements of the list can be any of the following values:

    NCAR-CCSM4_historical
    NCAR-CCSM4_rcp85
    ERA-Interim_historical
    GFDL-CM3_historical
    GFDL-CM3_rcp85

Alternatively, this argument can simply be the string "all" to execute 
for all groups. Technically, the values given here are treated as regular
expressions, so you can specify substrings like 'historical' or 'GFDL-CM3'
to match multiple groups.
Some steps in the pipeline may require specific data groups
or combinations of them. For example, the 'deltas' step and beyond requires
that both the 'rcp85' and 'historical' models for GFDL-CM3 or NCAR-CCSM4
be included.

STEPS is a comma-separated list of steps to run in the pipeline. The format
is analogous to the field selection of the cut(1) command. Each element of the
list can be either the name of a step, or a range in one of the formats
below:

    NAME        The single step with NAME
    NAME-       The step with NAME and every step after it
    NAME-NAME2  All the steps from NAME to NAME2 (inclusive)
    -NAME       The steps from the first one up through NAME

Every step will only be run once and always in the order that the script
defines for the pipeline, REGARDLESS of how you specify them in the list.
Like with DATA_GROUPS, this can also be the string "all" to execute all steps.

Execute `run-pipeline ls` to see a list of the names for all the steps in
the pipeline and a description of each step, as well as a list of all 
possible data group names.

=cut

#######################
# Global data and flags
#######################

use File::Basename;
use File::Spec qw(rel2abs);

# Canonical ordering of the pipeline steps. Step selection, range parsing
# and the `ls` listing all rely on this order.
my @order = qw(
    durations ams intervals diff deltas
    warp multiply fudge undiff fudge_ci
);

# One-line description of each step, keyed by step name (shown by
# `run-pipeline ls`). There must be one entry for every name in @order.
my %descriptions = (
    'durations' => "Compute durations series from raw hourly data",
    'ams'       => "Compute annual maximum series from durations series",
    'intervals' => "Compute return intervals (with confidence bounds) from annual maximum series",
    'diff'      => "Rewrite return interval confidence bounds as differences from the median",
    'deltas'    => "Compute ratios of differences between historical and projected data groups",
    'warp'      => "Warp grids of deltas to match grid of NOAA Atlas 14 data",
    'multiply'  => "Multiply NOAA Atlas 14 data by deltas for final product.",
    'fudge'     => "Fudge values for consistency",
    'undiff'    => "Re-apply the diffs created in the diff step and add header information to data",
    'fudge_ci'  => "Fudge confidence intervals values for consistency",
);

# Run flag for every pipeline step, keyed by step name.
# Everything starts enabled; the --steps option narrows the selection.
my %do_steps;
@do_steps{@order} = (1) x @order;

# Every data group name the pipeline knows about.
my @allowed_data_groups = (
    'NCAR-CCSM4_historical',
    'NCAR-CCSM4_rcp85',
    'ERA-Interim_historical',
    'GFDL-CM3_historical',
    'GFDL-CM3_rcp85',
);

# Use flag for every data group, keyed by group name.
# Everything starts enabled; the --groups option narrows the selection.
my %use_group;
@use_group{@allowed_data_groups} = (1) x @allowed_data_groups;

# Defaults for the command-line options.
my $dry_run     = 0;                    # -n: only print the commands
my $python_exec = 'python3';            # -e: python interpreter to invoke
my $max_procs   = 5;                    # -p: parallelism passed to some steps
my $data_dir    = '/workspace/Shared/Tech_Projects/DOT/project_data/wrf_pcpt/';
my $script_dir  = join('/', dirname($0), 'pipeline');
my $variant     = undef;                # -v: optional directory name suffix

# True when the SLURMBATCH environment variable is set (the #SBATCH --export
# line at the top of this file sets it for batch runs).
my $in_slurm    = defined($ENV{SLURMBATCH});

#######################
# MAIN
#######################

# `run-pipeline ls`: when 'ls' is the one and only argument, list the
# pipeline steps (numbered, in execution order, with descriptions) followed
# by the known data group names, then exit successfully.
if (@ARGV == 1 and $ARGV[0] eq 'ls') {
    print "The steps of the pipeline are (in order):\n";
    my $number = 0;
    for my $step (@order) {
        $number++;
        printf "\t\033[1m%d: %-10s\033[0m\t%s\n", $number, $step, $descriptions{$step};
    }

    print "The allowed data groups are:\n";
    print "\033[1m\t$_\033[0m\n" for @allowed_data_groups;

    exit 0;
}

use Getopt::Long qw(GetOptions Configure);
use Pod::Usage qw(pod2usage);

# Parse the command line. 'auto_help' lets Getopt::Long handle --help itself.
# --groups and --steps are handled by callbacks that update %use_group and
# %do_steps; the remaining options write straight into the default variables.
Configure('auto_help');
my $options_ok = GetOptions(
    'dry-run|n'     => \$dry_run,
    'procs|p=i'     => \$max_procs,
    'python|e=s'    => \$python_exec,
    'dir|d=s'       => \$data_dir,
    'script-dir=s'  => \$script_dir,
    'groups|g=s'    => \&parse_data_groups,
    'steps|s=s'     => \&parse_steps,
    'variant|v=s'   => \$variant,
);
die "Invalid options!" unless $options_ok;

# Anything left over after option parsing is a usage error.
if (@ARGV) {
    pod2usage("Invalid arguments.");
}

# The data directory has to exist, except on a dry run where nothing is read.
if (!$dry_run && !-d $data_dir) {
    die "Data directory, $data_dir, is not a directory.";
}

#######################
# STAGE 1
#######################

# Stage 1 covers the per-group steps, 'durations' through 'diff'.
# Each selected data group is taken through all of its enabled steps before
# the next group is started.

# Stage 1 steps in execution order: [ step key in %do_steps, exec_script args ].
my @stage1_steps = (
    [ durations => {
        name    => 'Durations',
        indir   => 'pcpt',
        outdir  => 'durations',
        script  => 'durations.py',
    } ],
    [ ams => {
        name    => 'AMS',
        indir   => 'durations',
        outdir  => 'ams',
        script  => 'ams.py',
    } ],
    [ intervals => {
        name    => 'Intervals',
        indir   => 'ams',
        outdir  => 'intervals',
        script  => 'intervals.py',
        extra_args => ['-n', $max_procs],   # this step honours --procs
    } ],
    [ diff => {
        name    => 'Diffs',
        indir   => 'intervals',
        outdir  => 'diff',
        script  => 'diff.py',
    } ],
);

my @selected_groups = grep { $use_group{$_} } @allowed_data_groups;
foreach my $group (@selected_groups) {
    print "\033[1m>>> $group <<<\033[0m\n";

    foreach my $entry (@stage1_steps) {
        my ($step_key, $step_args) = @$entry;
        next unless $do_steps{$step_key};
        exec_script(%$step_args, group => $group);
    }
} # end FOREACH $group

# If none of the steps from 'deltas' onward were requested, we are done.
my @stage2_step_names = @order[ get_order('deltas') .. $#order ];
my $stage2_requested  = grep { $do_steps{$_} } @stage2_step_names;
exit 0 unless $stage2_requested;

#######################
# STAGE 2
#######################

#
# The steps from this point forward act on pairs of historical and 
# projected (rcp85) models, so for either the NCAR-CCSM4 or GFDL-CM3 models,
# both the historical and rcp85 versions need to have been included.
#

# Print warnings to the user if any specified groups cannot be used.
if ($use_group{'ERA-Interim_historical'}) {
    print STDERR "\033[33;1mWARNING:\033[0m ERA-Interim_historical is not used".
        " for the remaining steps, so it will be ignored.\n";
}

# Models that come as a historical/rcp85 pair, listed in the order they are
# processed. A fixed list is used rather than `keys %pairs` because Perl does
# not guarantee hash key order, which made the processing order of the
# models (and therefore the log output) vary from run to run.
my @paired_models = ('NCAR-CCSM4', 'GFDL-CM3');

# %pairs maps each model name to a true value when BOTH its historical and
# rcp85 groups were selected. A model with only one of the two selected
# produces a warning and is skipped.
my %pairs;
for my $model (@paired_models) {
    my $has_historical = $use_group{"${model}_historical"};
    my $has_projected  = $use_group{"${model}_rcp85"};

    if ($has_historical xor $has_projected) {
        print STDERR "\033[33;1mWARNING:\033[0m Both the historical and projected".
            " (rcp85) versions of the $model model must be specified.".
            " But only one was specified, so it will be ignored.\n";
    }

    $pairs{$model} = $has_historical && $has_projected;
}

# Usable models, in the fixed order above. Abort if there are none.
my @valid_pairs = grep { $pairs{$_} } @paired_models;
unless (@valid_pairs) {
    print STDERR "\033[31;1mERROR:\033[0m No valid sets of data groups were".
        " specified for the remaining steps.\n";
    exit 1;
}

# Stage 2 steps in execution order: [ step key in %do_steps, exec_script args ].
# Where 'indir' is a list, exec_script checks every directory for existence
# but only passes the first one to the python script.
my @stage2_steps = (
    [ deltas => {
        name    => 'Deltas',
        indir   => 'diff',
        outdir  => 'deltas',
        script  => 'deltas.py',
    } ],
    [ warp => {
        name    => 'Warp',
        indir   => [ 'deltas', 'NOAA' ],
        outdir  => 'warp',
        script  => 'warp.py',
        extra_args  => ['-a', "$data_dir/NOAA/"],
    } ],
    [ multiply => {
        name    => 'Multiply',
        indir   => [ 'warp', 'NOAA' ],
        outdir  => 'multiply',
        script  => 'multiply.py',
        extra_args  => ['-a', "$data_dir/NOAA/"],
    } ],
    [ fudge => {
        name    => 'Fudging',
        indir   => 'multiply',
        outdir  => 'fudge',
        script  => 'fudge.py',
    } ],
    [ undiff => {
        name    => 'Undiff',
        indir   => 'fudge',
        outdir  => 'undiff',
        script  => 'undiff.py',
    } ],
    [ fudge_ci => {
        name    => 'Fudging CI',
        indir   => 'undiff',
        outdir  => 'fudgeci',
        script  => 'fudge_ci.py',
    } ],
);

# Run every enabled stage 2 step for each usable historical/rcp85 model pair.
foreach my $model (@valid_pairs) {
    print "\033[1m>>> $model <<<\033[0m\n";

    foreach my $entry (@stage2_steps) {
        my ($step_key, $step_args) = @$entry;
        next unless $do_steps{$step_key};
        exec_script(%$step_args, group => $model);
    }
}

#######################
# Helper functions
#######################

use List::Util qw(first);
use File::Path qw(make_path);

=for comment
Execute the python script for a single pipeline step.
Arguments are interpreted as a hash with the following options.
- name:         Name displayed in CLI output for this step
- script:       File name of the python script, looked up in $script_dir
- indir:        Directory containing input files for this step
                    (if this is an arrayref, multiple input directories
                     can be specified, but only the first one is passed to
                     the script, the others are just checked for existence)
- outdir:       Directory to put output files from this step
- group:        Data Group for this step
- extra_args:   Array of additional arguments to pass to python script
On a dry run the command is only printed and no directory is checked or
created. Otherwise the command is run (through srun when inside SLURM) and
the whole program exits with status 2 if the command could not be started,
was killed by a signal, or returned a non-zero exit code.
=cut
sub exec_script {
    my %params = @_;
    print "Step: \033[1m$params{name}\033[0m...\n";

    # Normalise 'indir' to a list; the first entry is the main input directory.
    my @indirs = ref $params{indir} eq 'ARRAY'
        ? @{ $params{indir} }
        : ($params{indir});

    my $main_indir;
    if ($dry_run) {
        # Dry run: don't touch the filesystem, just show the variant name.
        $main_indir = $indirs[0];
        $main_indir .= $variant ? "-$variant" : "";
    }
    else {
        # Every input directory must exist (check_input_dir exits otherwise);
        # it returns the name actually found, which may carry the variant.
        my @found = map { check_input_dir($_) } @indirs;
        $main_indir = $found[0];
        create_output_dir($params{outdir});
    }

    my $outdir = $variant ? "$params{outdir}-$variant" : $params{outdir};

    # Assemble command
    my @command = (
        $python_exec, "$script_dir/$params{script}",
        '-p', "$data_dir/$main_indir",
        '-o', "$data_dir/$outdir",
        '-d', $params{group},
    );
    push    @command, @{ $params{extra_args} } if $params{extra_args};
    unshift @command, 'srun'                   if $in_slurm;

    # Dry run: print the command instead of executing it.
    if ($dry_run) {
        print "\033[3;36m@command\033[0m\n";
        return;
    }

    # List-form system() bypasses the shell. $? holds the full wait status:
    # -1 if the command could not be started, the signal number in the low
    # bits if it was killed, and the exit code in the high byte. Testing the
    # whole value (not just $? >> 8) also catches a step that died from a
    # signal, which previously let the pipeline continue with missing output.
    system @command;
    if ($? != 0) {
        print STDERR "\033[31;1mERROR:\033[0m Error occured executing step! Aborting...\n";
        exit 2;
    }
    return;
}

=for comment
Return the zero-based position of the named step within @order.
Dies with "Invalid step name" if no step has that name.
=cut
sub get_order {
    my ($step) = @_;

    for my $position (0 .. $#order) {
        return $position if $order[$position] eq $step;
    }

    die "Invalid step name '$step'.";
}

=for comment
Parser for DATA_GROUPS option (Getopt::Long callback: option name, value).
The value is either 'all' (keep every group enabled) or a comma-separated
list of patterns. Each pattern is used as a regular expression against
@allowed_data_groups and every matching group is enabled in %use_group;
all other groups are disabled.
Dies if the list is empty, if an element is empty, if an element is not a
valid regular expression, or if an element matches no group.
=cut
sub parse_data_groups {
    my (undef, $val) = @_;

    return if ($val eq 'all');  # all is the default already, so do nothing
                                # if 'all' was specified.

    my @names = split /,/, $val;
    die "No data groups were specified." unless @names;

    # Otherwise, assume all groups are false and just add the ones
    # that the user specified.
    $use_group{$_} = 0 foreach (keys %use_group);

    foreach my $name (@names) {
        # An empty element (e.g. from ',historical') must be rejected up
        # front: Perl treats an empty match pattern specially (it reuses the
        # last successful pattern, or matches everything), so it would
        # silently select the wrong groups.
        die "Empty data group name in list '$val'." if $name eq '';

        # Compile once, and turn a regex syntax error into a readable message.
        my $pattern = eval { qr/$name/ }
            or die "'$name' is not a valid data group pattern.";

        my @matches = grep { $_ =~ $pattern } @allowed_data_groups
            or die "'$name' did not match any data groups.";

        # Alert the user if a specified name matched more than one group
        # (in case it was accidental)
        if (@matches > 1) {
            print STDERR "\033[36;1mINFO:\033[0m Pattern '$name' matched".
                " multiple groups: " . join(', ', @matches) . "\n";
        }

        $use_group{$_} = 1 foreach (@matches);
    }
    return;
}

=for comment
Parser for STEPS option (Getopt::Long callback: option name, value).
The value is either 'all' (keep every step enabled) or a comma-separated
list of step names and ranges in the style of cut(1): NAME, -NAME, NAME-
or NAME-NAME2. Every step covered by an element is enabled in %do_steps and
all other steps are disabled.
Dies on an unknown step name (via get_order) or on invalid list syntax.
=cut
sub parse_steps {
    my (undef, $val) = @_;

    # 'all' is already the default, so there is nothing to change.
    return if $val eq 'all';

    # Start with every step switched off, then enable what was asked for.
    $_ = 0 for values %do_steps;

    for my $spec (split /,/, $val) {
        # Translate the element into first/last indices into @order.
        my ($from, $to);
        if ($spec =~ /^\w+$/) {
            $from = $to = get_order($spec);
        }
        elsif ($spec =~ /^-(\w+)$/) {
            ($from, $to) = (0, get_order($1));
        }
        elsif ($spec =~ /^(\w+)-$/) {
            ($from, $to) = (get_order($1), $#order);
        }
        elsif ($spec =~ /^(\w+)-(\w+)$/) {
            my ($first_name, $last_name) = ($1, $2);
            $from = get_order($first_name);
            $to   = get_order($last_name);
        }
        else {
            die "Invalid list syntax.";
        }

        # A range whose end precedes its start is empty and enables nothing.
        for my $step (@order[$from .. $to]) {
            $do_steps{$step} = 1;
        }
    }
}

=for comment
Find an input directory inside the data directory and return its name.
When $variant is defined, "NAME-$variant" is tried first and plain NAME is
the fallback; otherwise only plain NAME is tried. If no candidate exists,
an error is printed to STDERR and the program exits with status 1.
=cut
sub check_input_dir {
    my ($dir) = @_;

    # Candidate names in order of preference.
    my @candidates = defined $variant
        ? ("$dir-$variant", $dir)
        : ($dir);

    foreach my $candidate (@candidates) {
        return $candidate if -d "$data_dir/$candidate";
    }

    print STDERR "\033[31;1mERROR:\033[0m Directory '$dir' does not exist in data directory.\n";
    exit 1;
}

=for comment
Create an output directory (and any missing parents) inside the data
directory. When $variant is set, "-$variant" is appended to the name.
If the directory cannot be created, an error is printed to STDERR and the
program exits with status 1.
=cut
sub create_output_dir {
    my ($dir) = @_;
    my $target = $variant ? "$dir-$variant" : $dir;

    # make_path reports problems through the 'error' arrayref.
    my $failures;
    make_path("$data_dir/$target", { error => \$failures });
    return unless $failures && @$failures;

    print STDERR "\033[31;1mERROR:\033[0m Unable to create directory '$target' in data directory.\n";
    exit 1;
}