#!/bin/sh

# Copyright (c) 2014-2020 Ganael LAPLANCHE <ganael.laplanche@martymac.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

# This script is a simple wrapper showing how fpart can be used to migrate data.
# It uses fpart and a copy tool (rsync or cpio) to spawn multiple instances to
# migrate data from src_dir/ to dst_url/. Jobs can execute either locally or
# over SSH.

# Program version, displayed by usage()
FPSYNC_VERSION="1.2.0"

########## Default values for options

# External tool used to copy files (-m), either 'rsync' or 'cpio'
OPT_TOOL_NAME="rsync"
# Number of sync jobs to run in parallel ("workers", -n)
OPT_JOBS=2
# Same, but autodetected
#OPT_JOBS=$(sysctl -n hw.ncpu)  # On FreeBSD
#OPT_JOBS=$(nproc)              # On Linux
# Number of sync jobs from resumed run, read from the 'info' file
OPT_RJOBS=
# Maximum files or directories per sync job (-f)
OPT_FPMAXPARTFILES="2000"
# Maximum bytes per sync job (-s)
OPT_FPMAXPARTSIZE="$((4 * 1024 * 1024 * 1024))" # 4 GB
# Work on a per-directory basis (-E, disabled by default)
OPT_DIRSONLY=""
# SSH workers (execute jobs locally if not defined, -w)
OPT_WRKRS=""
# Fpart shared dir (must be shared amongst all workers, -d)
OPT_FPSHDIR=""
# Temporary dir (local, used for queue management, -t)
OPT_TMPDIR="/tmp/fpsync"
# Job name for resume mode (-r)
OPT_JOBNAME=""
# User-settable tool options (-o)
OPT_TOOL=""
# Fpart options (-O)
OPT_FPART="-x .zfs -x .snapshot* -x .ckpt"
# Sudo mode (-S)
OPT_SUDO=""
# Verbose mode (-v), incremented each time the option is given
OPT_VERBOSE="0"
# Source directory (absolute path)
OPT_SRCDIR=""
# Destination directory (absolute path, or URL when using rsync)
OPT_DSTURL=""

# Mail - Uncomment to receive a mail when the whole run has finished.
# The master machine (the one running this script) must be able to send mail
# using the 'mail' command. No CLI option is provided to change it.
#OPT_MAIL="address@mydomain.tld"

########## Various functions

# Print help (program banner, synopsis and the description of every option)
# to stdout. Reads FPSYNC_VERSION and $0.
usage () {
    echo "fpsync v${FPSYNC_VERSION} - Sync directories in parallel using fpart"
    echo "Copyright (c) 2014-2020 Ganael LAPLANCHE <ganael.laplanche@martymac.org>"
    echo "WWW: http://contribs.martymac.org"
    echo "Usage: $0 [OPTIONS...] src_dir/ dst_url/"
    echo "Usage: $0 [-r jobname] [OPTIONS...]"
    echo "  -m tool     external copy tool to use: 'rsync' (default) or 'cpio'"
    echo "  -n x        start <x> concurrent sync jobs"
    echo "  -f y        transfer at most <y> files or directories per sync job"
    echo "  -s z        transfer at most <z> bytes per sync job"
    echo "  -E          work on a per-directory basis (rsync only)"
    echo "              (WARNING!!! Enables rsync(1)'s --delete option!)"
    echo "  -w wrks     space-separated list of SSH workers"
    echo "              e.g.: -w 'login@host1 login@host2 login@host3'"
    echo "              or:   -w 'login@host1' -w 'login@host2' -w 'login@host3'"
    echo "              Jobs are executed locally if not specified (default)."
    echo "  -d /dir/    set fpsync shared dir to </dir/> (absolute path)"
    echo "              This option is mandatory when using SSH workers."
    echo "  -t /dir/    set fpsync temp dir to </dir/> (absolute path)"
    echo "  -r jobname  resume job <jobname>"
    echo "              (options -f, -s, -o, /src_dir/ and /dst_url/"
    echo "              are ignored when resuming a previous run)"
    echo "  -o options  override default copy tool options with <options>"
    echo "              Use this option with care as certain options are"
    echo "              incompatible with a parallel usage (e.g. rsync's --delete)"
    echo "  -O options  override default fpart options with <options>"
    echo "  -S          use sudo for filesystem crawling and synchronizations"
    echo "  -v          verbose mode (default: quiet)"
    echo "              This option can be specified several times to"
    echo "              increase verbosity level."
    echo "  -h          this help"
    echo "  src_dir/    source directory (absolute path)"
    echo "  dst_url/    destination directory (or URL, when using rsync(1))"
    echo "See fpsync(1) for more details."
}

# Terminate the program with exit code 1
# $1 = optional message, written to stderr when non-empty
end_die () {
    if [ -n "$1" ]
    then
        echo "$1" >&2
    fi
    exit 1
}

# Log a timestamped message and, depending on verbosity, print it to stdout
# $1 = level (0 = quiet, 1 = verbose, >=2 more verbose); the message is
#      printed only when OPT_VERBOSE is greater than or equal to that level
# $2 = message; when non-empty, it is always appended to FPART_LOGFILE
echo_log () {
    local _now
    _now=$(date '+%s')

    # Console output, filtered by verbosity level
    if is_num "$1" && [ ${OPT_VERBOSE} -ge $1 ] && [ -n "$2" ]
    then
        echo "${_now} $2"
    fi

    # Log file output, unconditional for non-empty messages
    if [ -n "$2" ]
    then
        echo "${_now} $2" >> "${FPART_LOGFILE}"
    else
        return 1
    fi
}

# Check if $1 is an absolute path (i.e. starts with a slash)
# Returns 0 if so, 1 otherwise.
# Implemented with a shell pattern rather than 'echo | grep': this avoids
# forking on each call and examines the value as a whole (grep works on a
# per-line basis and would accept a multi-line value as soon as one of its
# lines starts with a slash).
is_abs_path() {
    case "$1" in
    /*)
        return 0
        ;;
    esac
    return 1
}

# Check if $1 looks like a rsync URL
# Accepted forms, cf. rsync(1) :
#   SSH:   [USER@]HOST:DEST
#   Rsync: [USER@]HOST::DEST
#   Rsync: rsync://[USER@]HOST[:PORT]/DEST
# The test is deliberately loose: one or more non-slash characters
# followed by a colon, anchored at the beginning of the string.
is_remote_path() {
    local _candidate="$1"
    echo "${_candidate}" | grep -q -E '^[^/]+:'
}

# Check if $1 is a number (non-empty string made of decimal digits only)
# Returns 0 if so, 1 otherwise.
# Implemented with a shell pattern rather than 'echo | grep': this avoids
# forking on each call (echo_log() calls this function for every message)
# and rejects multi-line values, which grep would accept as soon as one of
# their lines is numeric.
is_num () {
    case "$1" in
    ''|*[!0-9]*)
        return 1
        ;;
    esac
    return 0
}

# Check if $1 is an acceptable size argument, that is:
# - a decimal number strictly greater than 0 (leading zeroes are tolerated)
# - optionally followed by a single suffix amongst 'kKmMgGtTpP'
is_size () {
    local _value="$1"
    echo "${_value}" | grep -q -E '^0*[1-9][0-9]*[kKmMgGtTpP]?$'
}

# Parse user options and initialize OPT_* global variables
# $@ = command-line arguments, as received by the script
# Every option value is validated; any invalid value (or unknown option)
# terminates the program through end_die(). Once options are consumed, the
# remaining arguments are expected to be src_dir/ and dst_url/, unless a job
# is being resumed (-r), in which case they are ignored.
parse_opts () {
    local opt OPTARG OPTIND

    while getopts "m:n:f:s:Ew:d:t:r:o:O:Svh" opt
    do
        case "${opt}" in
        "m")
            if [ "${OPTARG}" = "rsync" ] || [ "${OPTARG}" = "cpio" ]
            then
                OPT_TOOL_NAME=${OPTARG}
            else
                end_die "Unsupported tool, please specify 'rsync' or 'cpio'"
            fi
            ;;
        "n")
            if is_num "${OPTARG}" && [ "${OPTARG}" -ge 1 ]
            then
                OPT_JOBS=${OPTARG}
            else
                end_die "Option -n expects a numeric value >= 1"
            fi
            ;;
        "f")
            if is_num "${OPTARG}" && [ "${OPTARG}" -ge 0 ]
            then
                OPT_FPMAXPARTFILES=${OPTARG}
            else
                end_die "Option -f expects a numeric value >= 0"
            fi
            ;;
        "s")
            # Either a plain number (0 disables the limit) or a size
            # with a unit suffix
            if { is_num "${OPTARG}" && [ "${OPTARG}" -ge 0 ] ;} || is_size "${OPTARG}"
            then
                OPT_FPMAXPARTSIZE=${OPTARG}
            else
                end_die "Option -s expects a numeric value >= 0"
            fi
            ;;
        "E")
            OPT_DIRSONLY="yes"
            ;;
        "w")
            # May be specified several times: workers are accumulated
            if [ -n "${OPTARG}" ]
            then
                OPT_WRKRS="${OPT_WRKRS} ${OPTARG}"
            else
                end_die "Invalid workers list supplied"
            fi
            ;;
        "d")
            if is_abs_path "${OPTARG}"
            then
                OPT_FPSHDIR="${OPTARG}"
            else
                end_die "Please supply an absolute path for shared dir"
            fi
            ;;
        "t")
            if is_abs_path "${OPTARG}"
            then
                OPT_TMPDIR="${OPTARG}"
            else
                end_die "Please supply an absolute path for temp dir"
            fi
            ;;
        "r")
            if [ -n "${OPTARG}" ]
            then
                OPT_JOBNAME="${OPTARG}"
            else
                end_die "Invalid job name supplied"
            fi
            ;;
        "o")
            if [ -n "${OPTARG}" ]
            then
                OPT_TOOL="${OPTARG}"
            else
                end_die "Invalid tool options supplied"
            fi
            ;;
        "O")
            if [ -n "${OPTARG}" ]
            then
                OPT_FPART="${OPTARG}"
            else
                end_die "Invalid fpart options supplied"
            fi
            ;;
        "S")
            OPT_SUDO="yes"
            ;;
        "v")
            OPT_VERBOSE="$((${OPT_VERBOSE} + 1))"
            ;;
        "h")
            usage
            exit 0
            ;;
        *)
            usage
            end_die "Invalid option specified"
            ;;
        esac
    done
    shift $((${OPTIND} - 1))

    # Validate partitions' constraints: both limits cannot be disabled
    if is_num "${OPT_FPMAXPARTFILES}" && [ "${OPT_FPMAXPARTFILES}" -eq 0 ] && \
        is_num "${OPT_FPMAXPARTSIZE}" && [ "${OPT_FPMAXPARTSIZE}" -eq 0 ]
    then
        end_die "Please specify at least a file (-f) or size (-s) limit for partitions"
    fi

    # Validate OPT_FPSHDIR (shared directory)
    if [ -z "${OPT_WRKRS}" ]
    then
        # For local jobs, set shared directory to temporary directory
        [ -z "${OPT_FPSHDIR}" ] && \
            OPT_FPSHDIR="${OPT_TMPDIR}"
    else
        # For remote ones, specifying a shared directory is mandatory
        [ -z "${OPT_FPSHDIR}" ] && \
            end_die "Please supply a shared dir when specifying workers"
    fi

    # Check for src_dir and dst_url presence and validity
    # (skipped in resume mode)
    if [ -z "${OPT_JOBNAME}" ]
    then
        # Check src dir, must be an absolute path
        if is_abs_path "$1"
        then
            OPT_SRCDIR="$1"
        else
            usage
            end_die "Please supply an absolute path for src_dir/"
        fi
        # Check dst_url, must be either an absolute path or a URL
        if is_abs_path "$2" || is_remote_path "$2"
        then
            is_remote_path "$2" && [ "${OPT_TOOL_NAME}" = "cpio" ] && \
                end_die "URLs are not supported when using cpio"
            OPT_DSTURL="$2"
        else
            usage
            [ "${OPT_TOOL_NAME}" = "cpio" ] && \
                end_die "Please supply an absolute path for dst_url/"
            end_die "Please supply either an absolute path or a rsync URL for dst_url/"
        fi
    fi

    # Handle tool-related options
    case "${OPT_TOOL_NAME}" in
    "cpio")
        [ "${OPT_DIRSONLY}" = "yes" ] && \
            end_die "Option -E is invalid when using cpio"
        ;;
    *)
        # Rsync: provide default options if the user did not override them
        [ -z "${OPT_TOOL}" ] && \
            OPT_TOOL="-lptgoD -v --numeric-ids"
        ;;
    esac

}

########## Work-related functions (in-memory, running-jobs handling)

# Build the initial list of free worker slots (WORK_FREEWORKERS).
# When SSH workers have been declared, OPT_WRKRS is expanded in a round-robin
# fashion until OPT_JOBS slots have been produced, so that each worker gets
# a fixed number of slots. When no worker is usable, OPT_WRKRS is sanitized
# (emptied) and the single pseudo-worker "local" is used.
work_list_free_workers_init () {
    local _nb_wrkrs _slot _pos _host

    # Number of declared workers (whitespace-separated words)
    _nb_wrkrs=$(echo ${OPT_WRKRS} | awk '{print NF}')

    if [ ${_nb_wrkrs} -le 0 ]
    then
        OPT_WRKRS=""
        WORK_FREEWORKERS="local"
        return 0
    fi

    _slot=0
    while [ ${_slot} -lt ${OPT_JOBS} ]
    do
        # 1-based position of the worker that receives this slot
        _pos=$((${_slot} % ${_nb_wrkrs} + 1))
        _host=$(echo ${OPT_WRKRS} | awk -v pos="${_pos}" '{print $pos}')
        WORK_FREEWORKERS="${WORK_FREEWORKERS} ${_host}"
        _slot=$((${_slot} + 1))
    done
}

# Print (to stdout) the first worker of the free workers' list.
# The list itself (WORK_FREEWORKERS) is left untouched.
work_list_pick_next_free_worker () {
    local _free_list="${WORK_FREEWORKERS}"
    echo "${_free_list}" | awk '{ print $1 }'
}

# Drop the first worker from the free workers' list (WORK_FREEWORKERS).
# As a side effect, whitespace within the list gets normalized because the
# list is word-split before being filtered.
work_list_trunc_next_free_worker () {
    local _remaining
    _remaining="$(echo ${WORK_FREEWORKERS} | \
        sed -E -e 's/^[[:space:]]*[^[:space:]]+[[:space:]]*//')"
    WORK_FREEWORKERS="${_remaining}"
}

# Register a newly-started work into the list of currently-running ones
# $1 = work descriptor; an empty descriptor is silently ignored
# Updates WORK_LIST (descriptor appended) and WORK_NUM (incremented).
work_list_push () {
    [ -n "$1" ] || return 0

    WORK_LIST="${WORK_LIST} $1"
    WORK_NUM="$((${WORK_NUM} + 1))"
}

# Rebuild the currently-running jobs' list by examining each process' state
# Each WORK_LIST entry has the form PID:PART:WORKER. Entries whose process is
# still alive are kept; for the other ones, a message is logged and (when SSH
# workers are used) the worker is given back to WORK_FREEWORKERS.
# Updates WORK_LIST, WORK_NUM and WORK_FREEWORKERS.
work_list_refresh () {
    # Keep the loop variable local: the same name is used by other loops
    # over WORK_LIST
    local _JOB _pid _worker
    local _WORK_LIST=""
    local _WORK_NUM=0
    for _JOB in ${WORK_LIST}
    do
        # First field: PID of the process running the job
        _pid="${_JOB%%:*}"
        # If the process is still alive, keep it
        if ps "${_pid}" 1>/dev/null 2>&1
        then
            _WORK_LIST="${_WORK_LIST} ${_JOB}"
            _WORK_NUM="$((${_WORK_NUM} + 1))"
        # If not, put its worker to the free list
        else
            echo_log "2" "<= [QMGR] Job ${_JOB} finished"
            if [ -n "${OPT_WRKRS}" ]
            then
                # Everything after the second colon: the worker name,
                # kept whole even if it contains a colon itself
                _worker="${_JOB#*:*:}"
                WORK_FREEWORKERS="${WORK_FREEWORKERS} ${_worker}"
            fi
        fi
    done
    WORK_LIST=${_WORK_LIST}
    WORK_NUM=${_WORK_NUM}
}

########## Jobs-related functions (on-disk, jobs' queue handling)

# Create the job queue directory (JOBS_QUEUEDIR) and the job work directory
# (JOBS_WORKDIR), parents included. Dies if one of them cannot be created.
job_queue_init () {
    if ! mkdir -p "${JOBS_QUEUEDIR}" 2>/dev/null
    then
        end_die "Cannot create job queue directory ${JOBS_QUEUEDIR}"
    fi

    if ! mkdir -p "${JOBS_WORKDIR}" 2>/dev/null
    then
        end_die "Cannot create job work directory ${JOBS_WORKDIR}"
    fi
}

# Escape $1 so that it can safely be written between double quotes in a file
# that will later be sourced by sh(1): backslashes, double quotes, dollar
# signs and backquotes get prefixed with a backslash.
# Prints the escaped value to stdout.
job_queue_info_escape () {
    printf '%s' "$1" | sed -e 's/[\\"$`]/\\&/g'
}

# Dump job queue information to allow later resuming
# Writes OPT_JOBS (as OPT_RJOBS), OPT_TOOL_NAME, OPT_SRCDIR and OPT_DSTURL to
# the "info" file within JOBS_QUEUEDIR, as shell assignments that
# job_queue_info_load() sources back. Dies if the file cannot be written.
job_queue_info_dump () {
    # Create "info" file with restrictive permissions
    local _TMPMASK="$(umask)"
    umask "0077"
    touch "${JOBS_QUEUEDIR}/info" 2>/dev/null
    umask "${_TMPMASK}"

    # Paths are user-supplied: escape them, otherwise characters that are
    # special between double quotes would be interpreted (or break the file)
    # when the file is sourced
    local _srcdir _dsturl
    _srcdir="$(job_queue_info_escape "${OPT_SRCDIR}")"
    _dsturl="$(job_queue_info_escape "${OPT_DSTURL}")"

    # Dump information necessary for job resuming
    cat << EOF > "${JOBS_QUEUEDIR}/info" || \
        end_die "Cannot record job information"
# Job information used for resuming, do not edit !
OPT_RJOBS="${OPT_JOBS}"
OPT_TOOL_NAME="${OPT_TOOL_NAME}"
OPT_SRCDIR="${_srcdir}"
OPT_DSTURL="${_dsturl}"
EOF
}

# Reload information recorded by a previous run from the "info" file within
# JOBS_QUEUEDIR, then validate what has been loaded (OPT_RJOBS,
# OPT_TOOL_NAME, OPT_SRCDIR and OPT_DSTURL). Dies on any invalid value.
# NOTE(review): the file is sourced, i.e. executed as shell code.
job_queue_info_load () {
    # Source info file and initialize a few variables
    . "${JOBS_QUEUEDIR}/info" || \
        end_die "Cannot read job information"

    # Number of jobs of the resumed run: a number >= 1
    if is_num "${OPT_RJOBS}" && [ ${OPT_RJOBS} -ge 1 ]
    then
        :
    else
        end_die "Invalid option value loaded from resumed job: OPT_RJOBS"
    fi

    # Copy tool: only rsync and cpio are known
    case "${OPT_TOOL_NAME}" in
    "rsync"|"cpio")
        ;;
    *)
        end_die "Invalid option value loaded from resumed job: OPT_TOOL_NAME"
        ;;
    esac

    # Source directory: absolute path
    if ! is_abs_path "${OPT_SRCDIR}"
    then
        end_die "Invalid options value loaded from resumed job: OPT_SRCDIR"
    fi

    # Destination: absolute path, URLs being accepted for rsync only
    case "${OPT_TOOL_NAME}" in
    "cpio")
        is_abs_path "${OPT_DSTURL}" || \
            end_die "Invalid options value loaded from resumed job: OPT_DSTURL"
        ;;
    *)
        is_abs_path "${OPT_DSTURL}" || is_remote_path "${OPT_DSTURL}" || \
            end_die "Invalid options value loaded from resumed job: OPT_DSTURL"
        ;;
    esac
}

# Set the "fp_done" (fpart done) flag within job queue
# The flag is created after a one-second pause, so that its mtime falls
# within the second following the last job file's mtime. This is necessary
# for filesystems that don't get below the second for mtime precision
# (msdosfs).
job_queue_fp_done () {
    local _flag="${JOBS_QUEUEDIR}/fp_done"
    sleep 1
    touch "${_flag}"
}

# Create the "sl_done" (sync loop done) flag file within the job queue
job_queue_sl_done () {
    local _flag="${JOBS_QUEUEDIR}/sl_done"
    touch "${_flag}"
}

# Create the "sl_stop" (sync loop stop) flag file within the job queue
job_queue_sl_stop () {
    local _flag="${JOBS_QUEUEDIR}/sl_stop"
    touch "${_flag}"
}

# Handle first ^C: stop queue processing by setting the "sl_stop" flag
# then wait for sync jobs to complete and display status before exiting
# Every call increments SIGINT_COUNT and (re-)sets the "sl_stop" flag, but
# only the first one waits, reports and exits.
sigint_handler () {
    SIGINT_COUNT="$((${SIGINT_COUNT} + 1))"
    job_queue_sl_stop
    if [ ${SIGINT_COUNT} -eq 1 ]
    then
        echo_log "1" "===> Interrupted. Waiting for running jobs to complete..."
        echo_log "1" "===> (hit ^C again to kill them and exit)"
        # Wait for queue processing to stop
        wait
        # Display current status before exiting
        [ ${OPT_VERBOSE} -ge 1 ] && siginfo_handler
        # Exit program (no message, exit code 1)
        end_die
    fi
}

# Handle subsequent ^C from within job_queue_loop(): kill sync processes
# to fast-unlock the main process (waiting for child processes to exit)
# Every call increments SIGINT_COUNT; processes listed in WORK_LIST
# (PID:PART:WORKER entries) are only killed when the counter reaches 2.
job_queue_loop_sigint_handler () {
    # Keep the loop variable local: this handler may interrupt another loop
    # over WORK_LIST that uses the same variable name
    local _JOB
    SIGINT_COUNT="$((${SIGINT_COUNT} + 1))"
    if [ "${SIGINT_COUNT}" -eq 2 ]
    then
        echo_log "1" "===> Interrupted again, killing remaining jobs"
        for _JOB in ${WORK_LIST}
        do
            # First field of the entry is the PID to kill
            kill "${_JOB%%:*}" 1>/dev/null 2>&1
        done
        # Wait for child processes to exit and let parent process end_die()
        wait
    fi
}

# Handle ^T: print (to stdout) two status lines about the queue
# - parts done / total (with percentage) and parts remaining
# - time elapsed since the current run started, with an estimation of the
#   remaining time and of the time spent per job
# Reads FPART_PARTSDIR, JOBS_WORKDIR, _run_start_time and _run_start_jobs.
siginfo_handler () {
    local _parts_total _parts_done _parts_left _parts_pct
    local _now _elapsed _done_this_run _secs_per_job _secs_left

    # Job counters, absolute values (common to several runs of the same job)
    _parts_total="$(cd "${FPART_PARTSDIR}" && ls -t1 | wc -l | awk '{print $1}')"
    _parts_done="$(cd "${JOBS_WORKDIR}" && ls -t1 | grep -v 'fp_done' | wc -l | awk '{print $1}')"
    _parts_left="$(( ${_parts_total} - ${_parts_done} ))"

    # Percentage is unknown as long as no partition has been produced
    if [ ${_parts_total} -ge 1 ]
    then
        _parts_pct="$(( (${_parts_done} * 100) / ${_parts_total} ))"
    else
        _parts_pct="??"
    fi

    # Time counters, relative to the current run (resume-aware)
    _now=$(date '+%s')
    _elapsed="$(( ${_now} - ${_run_start_time} ))"
    _done_this_run="$(( ${_parts_done} - ${_run_start_jobs} ))"

    # Estimations require at least one job finished during this run
    if [ ${_done_this_run} -ge 1 ]
    then
        _secs_per_job="$(( ${_elapsed} / ${_done_this_run} ))"
        _secs_left="$(( (${_elapsed} * ${_parts_left}) / ${_done_this_run} ))"
    else
        _secs_per_job="??"
        _secs_left="??"
    fi

    echo "${_now} <===   Parts done: ${_parts_done}/${_parts_total} (${_parts_pct}%), remaining: ${_parts_left}"
    echo "${_now} <=== Time elapsed: ${_elapsed}s, remaining: ~${_secs_left}s (~${_secs_per_job}s/job)"
}

# Dequeue the next job and print its name (relative to ${JOBS_WORKDIR}/)
# Prints nothing if no job is available, or "sl_stop" (without dequeuing
# anything) if the 'immediate stop' flag is set.
# JOBS_QUEUEDIR can host several types of file :
# <job_number>: a sync job to perform
# 'info': info file regarding this fpsync run
# 'sl_stop': the 'immediate stop' flag, set when ^C is hit
# 'fp_done': set when fpart has finished crawling src_dir/ and generated jobs
# 'sl_done': set when job_queue_loop() terminates (either normally or stopped)
job_queue_next () {
    local _oldest=""

    # The stop flag takes precedence over any queued job
    if [ -f "${JOBS_QUEUEDIR}/sl_stop" ]
    then
        echo "sl_stop"
        return 0
    fi

    # Oldest entry first (by mtime), names containing "info" being skipped
    _oldest=$(cd "${JOBS_QUEUEDIR}" && ls -rt1 | grep -v "info" | head -n 1)
    [ -n "${_oldest}" ] || return 0

    # Dequeue: move the entry from the queue to the work directory
    mv "${JOBS_QUEUEDIR}/${_oldest}" "${JOBS_WORKDIR}" || \
        end_die "Cannot dequeue next job"
    echo "${_oldest}"
}

# Main jobs' loop: pick up jobs within the queue directory and start them
# Runs until job_queue_next() returns either "fp_done" (no more job will be
# produced) or "sl_stop" (stop requested), then waits for running jobs and
# sets the "sl_done" flag. At most OPT_JOBS jobs run at the same time; each
# one is started in background, either locally or through SSH on the next
# free worker, and recorded as PID:PART:WORKER by work_list_push().
job_queue_loop () {
    echo_log "2" "===> [QMGR] Starting queue manager"

    # Trap SIGINT
    trap 'job_queue_loop_sigint_handler' 2
    # Ignore SIGINFO from within loop, handled by the parent (master) process
    # NOTE(review): signal number 29 is hard-coded; signal numbering is
    # platform-dependent - confirm it maps to SIGINFO on the target system
    trap '' 29

    local _NEXT=""
    while [ "${_NEXT}" != "fp_done" ] && [ "${_NEXT}" != "sl_stop" ]
    do
        local _PID=""
        # Only dequeue a job if a slot is available
        if [ ${WORK_NUM} -lt ${OPT_JOBS} ]
        then
            _NEXT="$(job_queue_next)"
            # An empty name means that the queue is (currently) empty
            if [ -n "${_NEXT}" ] && \
                [ "${_NEXT}" != "fp_done" ] && \
                [ "${_NEXT}" != "sl_stop" ]
            then
                if [ -z "${OPT_WRKRS}" ]
                then
                    # Local execution
                    echo_log "2" "=> [QMGR] Starting job ${JOBS_WORKDIR}/${_NEXT} (local)"
                    /bin/sh "${JOBS_WORKDIR}/${_NEXT}" &
                    work_list_push "$!:${_NEXT}:local"
                else
                    # Remote execution: the job file is fed to a remote
                    # shell through SSH
                    local _NEXT_HOST="$(work_list_pick_next_free_worker)"
                    work_list_trunc_next_free_worker
                    echo_log "2" "=> [QMGR] Starting job ${JOBS_WORKDIR}/${_NEXT} -> ${_NEXT_HOST}"
                    "${SSH_BIN}" "${_NEXT_HOST}" '/bin/sh -s' \
                        < "${JOBS_WORKDIR}/${_NEXT}" &
                    work_list_push "$!:${_NEXT}:${_NEXT_HOST}"
                fi
            fi
        fi
        # Remove finished jobs from the list and free their workers
        work_list_refresh
        sleep 0.2
    done

    if [ "${_NEXT}" = "fp_done" ]
    then
        echo_log "2" "<=== [QMGR] Done submitting jobs. Waiting for them to finish."
    else
        echo_log "2" "<=== [QMGR] Stopped. Waiting for jobs to finish."
    fi
    wait
    echo_log "2" "<=== [QMGR] Queue processed"

    # Set the 'sl_done' (sync done) flag to let the master process go
    job_queue_sl_done
}

########## Program start (main() !)

# Parse command-line options (initializes OPT_* variables, dies on error)
parse_opts "$@"

## Options' post-processing section

# Job name initialization
if [ -n "${OPT_JOBNAME}" ]
then
    # Resume mode, check if OPT_JOBNAME exists
    # (both its queue and work directories must be present)
    if [ -d "${OPT_TMPDIR}/queue/${OPT_JOBNAME}" ] && \
        [ -d "${OPT_TMPDIR}/work/${OPT_JOBNAME}" ]
    then
        FPART_JOBNAME="${OPT_JOBNAME}"
    else
        end_die "Could not find specified job's queue and work directories"
    fi
else
    # Generate a unique job name. This job name *must* remain
    # unique from one job to another.
    # It is built from the current timestamp and the PID of this process.
    FPART_JOBNAME="$(date '+%s')-$$"
fi

# Queue manager configuration. This queue remains local, even when using SSH.
JOBS_QUEUEDIR="${OPT_TMPDIR}/queue/${FPART_JOBNAME}"   # Queue dir.
JOBS_WORKDIR="${OPT_TMPDIR}/work/${FPART_JOBNAME}"     # Current jobs' dir.

# Paths to executables that must exist locally
# (left empty when the executable cannot be found)
FPART_BIN="$(command -v fpart)"
SSH_BIN="$(command -v ssh)"
MAIL_BIN="$(command -v mail)"

# Paths to executables that must exist both locally and remotely
SUDO_BIN="$(command -v sudo)"

# Paths to executables that must exist either locally or remotely (depending
# on if you use SSH or not). When using SSH, the following binaries must be
# present at those paths on each worker.
TOOL_BIN=$(command -v "${OPT_TOOL_NAME}")

# Do we need sudo ?
# SUDO is used as a command prefix and stays empty when sudo mode is off
if [ -n "${OPT_SUDO}" ]
then
    SUDO="${SUDO_BIN}"
else
    SUDO=""
fi

# Fpart paths. Those ones must be shared amongst all nodes when using SSH
# (e.g. through a NFS share mounted on *every* single node, including the master
# 'job submitter').
FPART_PARTSDIR="${OPT_FPSHDIR}/parts/${FPART_JOBNAME}"
FPART_PARTSTMPL="${FPART_PARTSDIR}/part"
FPART_LOGDIR="${OPT_FPSHDIR}/log/${FPART_JOBNAME}"
FPART_LOGFILE="${FPART_LOGDIR}/fpart.log"

# Prepare mode-specific tool and fpart options
# We do *not* want fpart option -zz in dirs-only mode because un-readable
# directories will be created when sync'ing the parent.
if [ -n "${OPT_DIRSONLY}" ]
then
    # Dirs-only mode, single-depth rsync(1)
    # Postpone deletion to limit impacts of a user interruption
    TOOL_MODEOPTS="-d --relative --delete --delete-after"
    FPART_MODEOPTS="-E"
else
    # File-based mode
    if [ "${OPT_TOOL_NAME}" = "cpio" ]
    then
        TOOL_MODEOPTS="-pdm"
        FPART_MODEOPTS="-zzz"
    else
        # Recursive rsync(1)
        TOOL_MODEOPTS="-r"
        FPART_MODEOPTS="-zz"
    fi
fi

# Fpart hooks (black magic is here !)
# FPART_JOBCOMMAND is the command line written to each job file; it sends the
# tool's stdout and stderr to per-partition files within FPART_LOGDIR.
# Escaped variables (\${FPART_PARTFILENAME}, \${FPART_PARTNUMBER}) are not
# expanded here but left for the hook's execution.
if [ "${OPT_TOOL_NAME}" = "cpio" ]
then
    # XXX Warning: -0 and --quiet are non-standard
    # (not supported on Solaris), see:
    # http://pubs.opengroup.org/onlinepubs/7908799/xcu/cpio.html
    # XXX Exec whole shell cmd as root, because we need to cwd first
    FPART_JOBCOMMAND="${SUDO} /bin/sh -c 'cd \\\"${OPT_SRCDIR}/\\\" && \
        cat \\\"\${FPART_PARTFILENAME}\\\" | \
        ${SUDO} ${TOOL_BIN} ${OPT_TOOL} -0 --quiet ${TOOL_MODEOPTS} \
        \\\"${OPT_DSTURL}/\\\"' \
        1>\"${FPART_LOGDIR}/\${FPART_PARTNUMBER}.stdout\" \
        2>\"${FPART_LOGDIR}/\${FPART_PARTNUMBER}.stderr\""
else
    FPART_JOBCOMMAND="/bin/sh -c '${SUDO} ${TOOL_BIN} ${OPT_TOOL} \
        ${TOOL_MODEOPTS} --files-from=\\\"\${FPART_PARTFILENAME}\\\" --from0 \
        \\\"${OPT_SRCDIR}/\\\" \
        \\\"${OPT_DSTURL}/\\\"' \
        1>\"${FPART_LOGDIR}/\${FPART_PARTNUMBER}.stdout\" \
        2>\"${FPART_LOGDIR}/\${FPART_PARTNUMBER}.stderr\""
fi
# Post-partition hook: writes the job command to a job file named after the
# partition number within JOBS_QUEUEDIR and, when verbosity is >= 2, prints
# a timestamped message
FPART_POSTHOOK="echo \"${FPART_JOBCOMMAND}\" > \
    \"${JOBS_QUEUEDIR}/\${FPART_PARTNUMBER}\" && \
    [ ${OPT_VERBOSE} -ge 2 ] && \
    echo \"\$(date '+%s') ==> [FPART] Partition \${FPART_PARTNUMBER} written\"" # [1]

# [1] Be careful to host the job queue on a filesystem that can handle
# fine-grained mtime timestamps (i.e. with a sub-second precision) if you want
# the queue to be processed in order when fpart generates several job files per
# second.
# On FreeBSD, vfs timestamps' precision can be tuned using the
# vfs.timestamp_precision sysctl. See vfs_timestamp(9).

## End of options' post-processing section, let's start for real now !

SIGINT_COUNT="0"    # ^C counter

WORK_NUM=0          # Current number of running processes
WORK_LIST=""        # Work PID:PART:WORKER list
WORK_FREEWORKERS="" # Free workers' list

# Check for essential binaries
# (ssh, mail and sudo are only required when the related feature is used)
[ ! -x "${FPART_BIN}" ] && \
    end_die "Fpart is missing locally, check your configuration"
[ -n "${OPT_WRKRS}" ] && [ ! -x "${SSH_BIN}" ] && \
    end_die "SSH is missing locally, check your configuration"
[ -n "${OPT_MAIL}" ] && [ ! -x "${MAIL_BIN}" ] && \
    end_die "Mail is missing locally, check your configuration"
[ -n "${OPT_SUDO}" ] && [ ! -x "${SUDO}" ] && \
    end_die "Sudo is missing locally, check your configuration"

# Create / check for fpart shared directories
if [ -z "${OPT_JOBNAME}" ]
then
    # For a new job, create those directories
    mkdir -p "${FPART_PARTSDIR}" 2>/dev/null || \
        end_die "Cannot create partitions' output directory: ${FPART_PARTSDIR}"
    mkdir -p "${FPART_LOGDIR}" 2>/dev/null || \
        end_die "Cannot create log directory: ${FPART_LOGDIR}"
else
    # In resume mode, FPART_PARTSDIR and FPART_LOGDIR must already exist
    if [ ! -d "${FPART_PARTSDIR}" ] || [ ! -d "${FPART_LOGDIR}" ]
    then
        end_die "Could not find specified job's 'parts' and 'log' directories"
    fi
fi

# Create or update log file
# (echo_log() appends to that file, so it must be writable from now on)
touch "${FPART_LOGFILE}" 2>/dev/null || \
    end_die "Cannot create log file: ${FPART_LOGFILE}"

# Create / check for job and work queues
if [ -z "${OPT_JOBNAME}" ]
then
    # For a new job, create those directories
    job_queue_init
else
    # When resuming a job, check if :
    # - the last fpart pass has completed
    #   (the 'fp_done' flag is present)
    # - the last fpsync pass has *not* completed
    #   (the 'fp_done' flag is *still* present)
    # - we can get the number of workers previously implied
    #   (the 'info' flag is present)
    # - the work queue exists
    if [ ! -f "${JOBS_QUEUEDIR}/fp_done" ] || [ ! -f "${JOBS_QUEUEDIR}/info" ]
    then
        end_die "Specified job is not resumable ('fp_done' or 'info' flag missing)"
    fi
    [ ! -d "${JOBS_WORKDIR}" ] && \
        end_die "Specified job is not resumable (work queue missing)"

    # Job is resumable, try to reload job options and prepare queues
    job_queue_info_load
    # OPT_TOOL_NAME may just have been overridden by the value recorded for
    # the resumed job: look up the tool's path again so that later presence
    # checks test the tool that job files actually use
    TOOL_BIN=$(command -v "${OPT_TOOL_NAME}")
    # Remove the "sl_stop" and "sl_done" flags, if any
    rm -f "${JOBS_QUEUEDIR}/sl_stop" 2>/dev/null
    rm -f "${JOBS_QUEUEDIR}/sl_done" 2>/dev/null
    # Move potentially-incomplete jobs to the jobs queue so that they can be
    # executed again. We consider the worst-case scenario and resume OPT_RJOBS
    # last jobs, some of them being partially finished.
    for _file in \
        $(cd "${JOBS_WORKDIR}" && ls -t1 | head -n "${OPT_RJOBS}")
    do
        mv "${JOBS_WORKDIR}/${_file}" "${JOBS_QUEUEDIR}" 2>/dev/null ||
            end_die "Cannot resume specified job"
    done
fi

# Validate src_dir/ locally (needed for fpart) for first runs or local ones
if [ -z "${OPT_JOBNAME}" ] || [ -z "${OPT_WRKRS}" ]
then
    [ ! -d "${OPT_SRCDIR}" ] && \
        end_die "Source directory does not exist (or is not a directory): ${OPT_SRCDIR}"
fi

# When using SSH, validate src_dir/ and dst_url/ remotely and check for tool
# presence (this also allows checking SSH connectivity to each declared host)
if [ -n "${OPT_WRKRS}" ]
then
    echo_log "2" "=====> Validating requirements on SSH nodes..."

    # The first declared worker is the one creating the witness file
    _FIRST_HOST="$(echo ${OPT_WRKRS} | awk '{print $1}')"
    for _host in ${OPT_WRKRS}
    do
        # Check for sudo presence (it must be passwordless)
        if [ -n "${OPT_SUDO}" ]
        then
            "${SSH_BIN}" "${_host}" "${SUDO} /bin/sh -c ':' 2>/dev/null" || \
                end_die "Sudo executable not found or requires password on target ${_host}"
        fi

        # When using a local (or NFS-mounted) dest dir...
        if is_abs_path "${OPT_DSTURL}"
        then
            # ...blindly try to create dst_url/ as well as a witness file on the first
            # node. Using a witness file will allow us to check for its
            # presence/visibility from other nodes, avoiding "split-brain"
            # situations where dst_url/ exists but is not shared amongst all nodes
            # (typically a local mount point where the shared storage area *should*
            # be mounted but isn't, for any reason).
            # The witness file is named after the job (FPART_JOBNAME).
            [ "${_host}" = "${_FIRST_HOST}" ] && \
                "${SSH_BIN}" "${_host}" "/bin/sh -c 'mkdir -p \"${OPT_DSTURL}\" && \
                    ${SUDO} touch \"${OPT_DSTURL}/${FPART_JOBNAME}\"' 2>/dev/null"
        fi

        # Check for src_dir/ presence
        "${SSH_BIN}" "${_host}" "/bin/sh -c '[ -d \"${OPT_SRCDIR}\" ]'" || \
            end_die "Source directory does not exist on target ${_host} (or is not a directory): ${OPT_SRCDIR}"

        # Check for dst_url/ presence (witness file)
        if is_abs_path "${OPT_DSTURL}"
        then
            "${SSH_BIN}" "${_host}" "/bin/sh -c '[ -f \"${OPT_DSTURL}/${FPART_JOBNAME}\" ]'" || \
                end_die "Destination directory (shared) is not available on target ${_host}: ${OPT_DSTURL}"
        fi

        # Finally, check for tool presence
        # (at the same path as the one found locally)
        "${SSH_BIN}" "${_host}" "/bin/sh -c '[ -x \"${TOOL_BIN}\" ]'" || \
            end_die "Tool ${OPT_TOOL_NAME} not found on target ${_host}"

        echo_log "2" "<=== ${_host}: OK"
    done

    # Remove witness file
    if is_abs_path "${OPT_DSTURL}"
    then
        "${SSH_BIN}" "${_FIRST_HOST}" \
            "/bin/sh -c '${SUDO} rm -f \"${OPT_DSTURL}/${FPART_JOBNAME}\"' 2>/dev/null"
    fi

    unset _FIRST_HOST
else
    # Local usage - create dst_url/ and check for tool presence
    if is_abs_path "${OPT_DSTURL}" && [ ! -d "${OPT_DSTURL}" ]
    then
        mkdir -p "${OPT_DSTURL}" 2>/dev/null || \
            end_die "Cannot create destination directory: ${OPT_DSTURL}"
    fi
    [ ! -x "${TOOL_BIN}" ] && \
        end_die "Tool ${OPT_TOOL_NAME} is missing locally, check your configuration"
fi

# Dispatch OPT_WRKRS into WORK_FREEWORKERS
work_list_free_workers_init

# Let's rock !
# Print (depending on verbosity) and log a summary of the run's settings
echo_log "2" "=====> [$$] Syncing ${OPT_SRCDIR} => ${OPT_DSTURL}"
echo_log "1" "===> Job name: ${FPART_JOBNAME}$([ -n "${OPT_JOBNAME}" ] && echo ' (resumed)')"
echo_log "2" "===> Start time: $(date)"
echo_log "2" "===> Concurrent sync jobs: ${OPT_JOBS}"
echo_log "2" "===> Workers: $(echo "${OPT_WRKRS}" | sed -E -e 's/^[[:space:]]+//' -e 's/[[:space:]]+/ /g')$([ -z "${OPT_WRKRS}" ] && echo 'local')"
echo_log "2" "===> Shared dir: ${OPT_FPSHDIR}"
echo_log "2" "===> Temp dir: ${OPT_TMPDIR}"
echo_log "2" "===> Tool name: \"${OPT_TOOL_NAME}\""
# The following options are ignored when resuming
if [ -z "${OPT_JOBNAME}" ]
then
    echo_log "2" "===> Tool options: \"${OPT_TOOL}\""
    echo_log "2" "===> Max files or directories per sync job: ${OPT_FPMAXPARTFILES}"
    echo_log "2" "===> Max bytes per sync job: ${OPT_FPMAXPARTSIZE}"
fi

# Record job information
# (done for resumed jobs too: the 'info' file is rewritten with current values)
job_queue_info_dump

# Record initial status, required by siginfo_handler()
# (number of entries already present in the work directory, 'fp_done'
# excluded, and start time of this run)
_run_start_jobs="$(cd "${JOBS_WORKDIR}" && ls -t1 | grep -v 'fp_done' | wc -l | awk '{print $1}')"
_run_start_time="$(date '+%s')"

# Set SIGINT and SIGINFO traps and start job_queue_loop
# NOTE(review): signal number 29 is hard-coded; signal numbering is
# platform-dependent - confirm it maps to SIGINFO on the target system
trap 'sigint_handler' 2
trap 'siginfo_handler' 29
echo_log "2" "===> Use ^C to abort, ^T (SIGINFO) to display status"
# The queue manager runs in background, while fpart (below) feeds the queue
job_queue_loop&

# When not resuming a previous job, start fpart
if [ -z "${OPT_JOBNAME}" ]
then
    echo_log "1" "===> Analyzing filesystem..."
    # Start fpart from src_dir/ directory and produce jobs within
    # ${JOBS_QUEUEDIR}/
    # Options -f and -s are only passed when the related limit is enabled
    # (i.e. not 0); fpart's output is displayed and appended to the log file
    cd "${OPT_SRCDIR}" && \
        ${SUDO} "${FPART_BIN}" \
        $([ ${OPT_FPMAXPARTFILES} -gt 0 ] && echo -f "${OPT_FPMAXPARTFILES}") \
        $({ { is_num "${OPT_FPMAXPARTSIZE}" && [ ${OPT_FPMAXPARTSIZE} -gt 0 ] ;} || is_size "${OPT_FPMAXPARTSIZE}" ;} && echo -s "${OPT_FPMAXPARTSIZE}") \
        -o "${FPART_PARTSTMPL}" -0 -e ${OPT_FPART} ${FPART_MODEOPTS} -L \
        -W "${FPART_POSTHOOK}" . 2>&1 | \
        tee -a "${FPART_LOGFILE}"
fi

# Tell job_queue_loop that crawling has finished
job_queue_fp_done

# Wait for job_queue_loop to terminate
# Use an active wait to allow signal processing (^T)
# (job_queue_loop creates the 'sl_done' flag when it is done)
echo_log "1" "===> Waiting for sync jobs to complete..."
while [ ! -f "${JOBS_QUEUEDIR}/sl_done" ]
do
    sleep 0.2
done

# Display final status
[ ${OPT_VERBOSE} -ge 1 ] && siginfo_handler

# Examine results and send an e-mail if requested
# A job is considered as failed when its stderr log file is not empty:
# RET holds the list of such files
RET=$(find "${FPART_LOGDIR}/" -name "*.stderr" ! -size 0)
MSG=$( { [ -z "${RET}" ] && echo 'Fpsync completed without error.' ;} || \
    { echo "Fpsync completed with errors, see logs:" && echo "${RET}" ;} )
if [ -n "${OPT_MAIL}" ]
then
    echo "${MSG}" | ${MAIL_BIN} -s "Fpsync job ${FPART_JOBNAME}" "${OPT_MAIL}"
fi
echo_log "1" "<=== ${MSG}"
echo_log "2" "<=== End time: $(date)"

# Exit code: 1 if at least one job reported errors, 0 otherwise
[ -n "${RET}" ] && exit 1
exit 0
