#!/bin/bash
# Copyright (C) 2019 Checkmk GmbH - License: GNU General Public License v2
# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and
# conditions defined in the file COPYING, which is part of this source code package.

# Base directory of the agent's variable data. A value already present in the
# environment wins; the result is exported to child processes.
export MK_VARDIR="${MK_VARDIR:-/var/lib/check_mk_agent}"
# Directory used for the temporary metrics file; defaults to /tmp when unset or empty.
: "${TMPDIR:=/tmp}"

# Print the usage text of mk-job to stdout, one argument per output line.
help() {
    printf '%s\n' \
        "Usage: mk-job JOB_NAME PROGRAM [ARGS...]" \
        "" \
        "Execute PROGRAM as subprocess while measuring performance information" \
        "about the running process and writing it to an output file. This file" \
        "can be monitored using Check_MK. The Check_MK Agent will forward the" \
        "information of all job files to the monitoring server." \
        "" \
        "This file is being distributed with the Check_MK Agent."
}

# The job name is the first command line argument of the script.
JOB_NAME=$1

# Job files are kept in a per-user directory below MK_VARDIR.
CURRENT_USER="$(whoami)"
JOB_DIR="$MK_VARDIR/job/$CURRENT_USER"

# Result file of the finished job, the marker that exists while this process
# (PID $$) is running the job, and the scratch file the metrics are collected in.
COMPLETED_FILE="$JOB_DIR/$JOB_NAME"
RUNNING_FILE="$JOB_DIR/${JOB_NAME}.$$running"
TMP_FILE="$TMPDIR/${JOB_NAME}.$$"

# Remove stale "<jobname>.<PID>running" marker files from a job directory.
#
# Arguments:
#   $1 - directory that holds the job files
#   $2 - name of the job whose running markers are checked
#
# A marker is kept only while ps reports a process with the embedded PID whose
# command contains "mk-job". Files matching the glob whose suffix is not
# "<digits>running" are left untouched: they carry no PID that could be checked.
cleanup_running_files() {
    local jobdir="$1"
    local jobname="$2"
    local file suffix pid
    # in some situations the trap is not executed and old running files pile up.
    # here we check if the PID is actually still running and if the process
    # name includes mk-job (another process might have the same PID by now)
    for file in "${jobdir}/${jobname}."*running; do
        # skip if file does not exist (an unmatched glob is passed on literally,
        # e.g. when the folder is empty or missing)
        [ -f "$file" ] || continue
        suffix=${file##*.}    # remove largest matching prefix -> "<PID>running"
        pid=${suffix%running} # remove smallest matching suffix -> "<PID>"
        case "$pid" in
            '' | *[!0-9]*)
                # no PID embedded in the name: nothing to verify, do not delete
                continue
                ;;
        esac
        # keep the file if the process is running and mk-job is in the command, otherwise: remove
        if ! ps -p "$pid" -o command | grep -q "mk-job"; then
            # -f: another mk-job instance may have removed the file in the meantime
            rm -f -- "$file"
        fi
    done
}

# Entry point: run PROGRAM under /usr/bin/time and publish the collected
# metrics as a job file.
#
# Arguments:
#   $1   - JOB_NAME (only shifted away here; the paths derived from it are
#          expected in the globals listed below)
#   $2.. - PROGRAM and its arguments
# Globals read:    JOB_DIR, JOB_NAME, CURRENT_USER, TMP_FILE, RUNNING_FILE, COMPLETED_FILE
# Globals written: RC, function cleanup
#
# This function never returns: it exits with 1 on usage errors, replaces the
# shell with PROGRAM when no running file can be written, and otherwise exits
# with the status of the /usr/bin/time invocation.
main() {

    if [ $# -lt 2 ]; then
        help >&2
        exit 1
    fi

    # drop JOB_NAME, "$@" is PROGRAM [ARGS...] from here on
    shift

    cleanup_running_files "$JOB_DIR" "$JOB_NAME"

    if [ ! -d "${JOB_DIR}" ]; then
        if ! mkdir -p "${JOB_DIR}" 2>/dev/null; then
            echo "ERROR: Unable to create output directory ${JOB_DIR} for user '${CURRENT_USER}'." >&2
            exit 1
        fi
    fi

    if ! type "${1}" >/dev/null 2>&1; then
        # printf '%s' prints the program name verbatim; "echo -e" would
        # interpret backslash sequences contained in the user supplied name
        printf 'ERROR: Cannot run %s. Command not found.\n\n' "${1}" >&2
        help >&2
        exit 1
    fi

    cleanup() {
        # shellcheck disable=SC2317 # shellcheck doesn't understand trap
        # TMP_FILE only still exists here if the run was interrupted or the
        # final mv failed; remove it as well so it does not pile up in TMPDIR
        rm -f "${RUNNING_FILE}" "${TMP_FILE}" 2>/dev/null
    }

    # The running file carries the start time while the job is in progress
    echo "start_time $(perl -e 'print time')" >"${TMP_FILE}" 2>/dev/null
    cp "${TMP_FILE}" "${RUNNING_FILE}" 2>/dev/null

    if [ ! -w "${RUNNING_FILE}" ]; then
        # Looks like we are lacking the permissions to create this file..
        # In this scenario no mk-job status file is created. We simply execute the command
        rm "${TMP_FILE}" 2>/dev/null
        exec "$@"
        # only reached if exec itself did not replace the shell
        exit $?
    fi

    # from here on the files created above are removed whenever the shell exits
    trap cleanup EXIT

    # BEGIN PLATFORM SPECIFIC CODE
    # /usr/bin/time has more comprehensive metrics under Linux.
    # Under AIX/Solaris, only real, sys and reads are available.
    /usr/bin/time -o "${TMP_FILE}" --append \
        -f "real %E\nuser %U\nsys %S\nreads %I\nwrites %O\nmax_res_kbytes %M\navg_mem_kbytes %K\ninvol_context_switches %c\nvol_context_switches %w" "$@"
    # END PLATFORM SPECIFIC CODE
    RC=$?
    echo "exit_code ${RC}" >>"${TMP_FILE}"

    # publish the finished metrics under the job's name
    mv "${TMP_FILE}" "${COMPLETED_FILE}"
    exit $RC
}

# Run main unless MK_SOURCE_ONLY is non-empty, in which case the file only
# provides its definitions. Written as an if-statement so that this last
# command finishes with status 0 when main is skipped; a failed
# "[ ... ] && main" would make sourcing the file report status 1.
if [ -z "${MK_SOURCE_ONLY}" ]; then
    main "$@"
fi
