#!/usr/bin/env bash
#
# Submit a qc-cli training job for the flower-photos example dataset.
#
# Steps, in order:
#   1. qc-cli infra status
#   2. qc-cli upload <dataset-dir>     (skipped with --skip-upload)
#   3. qc-cli train start
#   4. qc-cli train status <job>       (once, or polled with --wait)
#
# Environment:
#   POLL_SECONDS  Seconds to sleep between status polls when --wait is given.
#                 Default: 60
#
# Run with --help for the command-line options.

set -euo pipefail

# Defaults; the option parser further down may override the first four.
CONFIG_PATH="config.yaml"
DATASET_DIR="examples/training/data/flower_photos_sagemaker"
WAIT=false
SKIP_UPLOAD=false
# Honour a caller-supplied POLL_SECONDS so the poll interval can be tuned
# without editing the script; falls back to 60 when unset or empty.
POLL_SECONDS="${POLL_SECONDS:-60}"

#######################################
# Print the command-line help text.
# Globals:   DATASET_DIR (read) - shown as the dataset directory default.
# Arguments: none
# Outputs:   help text on stdout (callers redirect to stderr when needed).
#######################################
usage() {
  cat <<EOF
Usage: $0 [options]

Options:
  --config PATH        Path to qc-cli config file. Default: config.yaml
  --dataset-dir PATH   Dataset directory to upload. Default: ${DATASET_DIR}
  --skip-upload        Train against data already uploaded to s3.data_prefix.
  --wait               Poll until training completes.
  -h, --help           Show this help.
EOF
}

#######################################
# Abort with a readable message when a value-taking option is the last word
# on the command line. Without this check, reading "$2" under `set -u` dies
# with a bare "unbound variable" error that does not name the option.
# Arguments: $1 - option name
#            $2 - number of arguments remaining, including the option itself
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "Option $1 requires a value." >&2
    exit 1
  fi
}

#######################################
# Parse command-line options.
# Globals:   CONFIG_PATH, DATASET_DIR, SKIP_UPLOAD, WAIT (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for -h/--help; error text on stderr otherwise
# Exits:     0 after printing help; 1 on an unknown option or a missing value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --config)
        require_value "$1" "$#"
        CONFIG_PATH="$2"
        shift 2
        ;;
      --dataset-dir)
        require_value "$1" "$#"
        DATASET_DIR="$2"
        shift 2
        ;;
      --skip-upload)
        SKIP_UPLOAD=true
        shift
        ;;
      --wait)
        WAIT=true
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        usage >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Pre-flight checks: the config file must exist, and the dataset directory
# must exist too unless the upload step is being skipped.
[[ -f "${CONFIG_PATH}" ]] || {
  echo "Config not found: ${CONFIG_PATH}" >&2
  exit 1
}

if [[ "${SKIP_UPLOAD}" == false ]]; then
  if [[ ! -d "${DATASET_DIR}" ]]; then
    echo "Dataset not found: ${DATASET_DIR}" >&2
    echo "Run: bash examples/training/download_flower_photos.sh" >&2
    exit 1
  fi
fi

#######################################
# Log a command line, then execute it.
# Arguments: the command followed by its arguments
# Outputs:   "+ <command line>" on stdout, followed by the command's own output
# Returns:   the exit status of the executed command
#######################################
run() {
  # %q shell-quotes every word, so arguments containing spaces or glob
  # characters stay unambiguous and the logged line can be pasted back into
  # a shell. Plain words are printed unchanged.
  printf '+'
  printf ' %q' "$@"
  printf '\n'
  "$@"
}

# Show the infrastructure status first.
run uv run qc-cli infra status --config "${CONFIG_PATH}"

# Upload the dataset unless --skip-upload was given.
case "${SKIP_UPLOAD}" in
  false)
    run uv run qc-cli upload "${DATASET_DIR}" --config "${CONFIG_PATH}"
    ;;
esac

# Start the training job, capturing stdout so the job name can be parsed from
# it afterwards. The exit status is collected by hand: with a bare assignment,
# `set -e` would abort the script on failure before the captured output was
# ever printed, hiding whatever qc-cli wrote to stdout.
train_status=0
TRAIN_OUTPUT="$(uv run qc-cli train start --config "${CONFIG_PATH}")" || train_status=$?
printf '%s\n' "${TRAIN_OUTPUT}"
if (( train_status != 0 )); then
  echo "qc-cli train start failed with exit status ${train_status}." >&2
  exit "${train_status}"
fi

# Pull the job name (qc-cli-<8 digits>-<6 digits>) out of the captured
# `train start` output; if several strings match, the last one wins.
# `|| true` is required: grep exits 1 when nothing matches, which under
# `pipefail` + `set -e` would abort the script at this assignment and make
# the explicit error message below unreachable.
JOB_NAME="$(grep -Eo 'qc-cli-[0-9]{8}-[0-9]{6}' <<<"${TRAIN_OUTPUT}" | tail -n 1 || true)"
if [[ -z "${JOB_NAME}" ]]; then
  echo "Could not find training job name in qc-cli output." >&2
  exit 1
fi

echo "Submitted SageMaker training job: ${JOB_NAME}"

# Without --wait: print the job status a single time and finish successfully.
case "${WAIT}" in
  false)
    run uv run qc-cli train status "${JOB_NAME}" --config "${CONFIG_PATH}"
    exit 0
    ;;
esac

# With --wait: poll the job status every POLL_SECONDS until the status output
# reports a terminal state.
# Exit status: 0 on Completed; 1 on Failed or Stopped; the status command's
# own exit status if that command fails.
while true; do
  # Collect the exit status by hand so the captured output is still printed
  # when the status command fails (a bare assignment would abort under `set -e`).
  poll_status=0
  STATUS_OUTPUT="$(uv run qc-cli train status "${JOB_NAME}" --config "${CONFIG_PATH}")" || poll_status=$?
  printf '%s\n' "${STATUS_OUTPUT}"
  if (( poll_status != 0 )); then
    echo "qc-cli train status failed with exit status ${poll_status}." >&2
    exit "${poll_status}"
  fi

  # Here-strings rather than `printf ... | grep -q`: grep -q quits at the first
  # match, and a writer that is still writing then receives SIGPIPE. Under
  # `pipefail` that turns a successful match into a failed pipeline, which the
  # `if` would read as "no match".
  if grep -q 'Status:.*Completed' <<<"${STATUS_OUTPUT}"; then
    echo "Training completed successfully."
    exit 0
  fi

  if grep -q 'Status:.*Failed' <<<"${STATUS_OUTPUT}"; then
    echo "Training failed." >&2
    exit 1
  fi

  if grep -q 'Status:.*Stopped' <<<"${STATUS_OUTPUT}"; then
    echo "Training stopped." >&2
    exit 1
  fi

  sleep "${POLL_SECONDS}"
done