command to start sagemaker training
include sample training
This commit is contained in:
111
examples/training/run_training.sh
Executable file
111
examples/training/run_training.sh
Executable file
@@ -0,0 +1,111 @@
|
||||
#!/usr/bin/env bash
#
# Upload a dataset with qc-cli, start a SageMaker training job, and optionally
# poll until the job reaches a terminal status.

# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Default settings; the command-line options further down may override the
# first four.
CONFIG_PATH='config.yaml'
DATASET_DIR='examples/training/data/flower_photos_sagemaker'
WAIT='false'
SKIP_UPLOAD='false'
# Seconds to sleep between status checks when polling.
POLL_SECONDS=60
|
||||
|
||||
#######################################
# Print the command-line help text.
# Globals:   DATASET_DIR (read; shown as the --dataset-dir default)
# Arguments: none
# Outputs:   the help text on stdout
#######################################
usage() {
  # One argument per output line; the empty string produces the blank line
  # between the usage summary and the option list.
  printf '%s\n' \
    "Usage: $0 [options]" \
    "" \
    "Options:" \
    "--config PATH Path to qc-cli config file. Default: config.yaml" \
    "--dataset-dir PATH Dataset directory to upload. Default: ${DATASET_DIR}" \
    "--skip-upload Train against data already uploaded to s3.data_prefix." \
    "--wait Poll until training completes." \
    "-h, --help Show this help."
}
|
||||
|
||||
#######################################
# Report an option that was given without its required value, then exit.
# Arguments: $1 - the option name as typed by the user
# Outputs:   an error line followed by the usage text, both on stderr
# Returns:   never returns; exits with status 1
#######################################
missing_option_value() {
  echo "Option $1 requires a value." >&2
  usage >&2
  exit 1
}

#######################################
# Parse command-line options into the script's global settings.
# Globals:   CONFIG_PATH, DATASET_DIR, SKIP_UPLOAD, WAIT (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on stdout for -h/--help; errors and usage on stderr
# Returns:   0 when all options were consumed; exits 0 after --help and
#            exits 1 on an unknown option or a missing option value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --config)
        # Check before touching "$2": with `set -u` an absent value would
        # otherwise abort with a bare "unbound variable" error.
        [[ $# -ge 2 ]] || missing_option_value "$1"
        CONFIG_PATH="$2"
        shift 2
        ;;
      --dataset-dir)
        [[ $# -ge 2 ]] || missing_option_value "$1"
        DATASET_DIR="$2"
        shift 2
        ;;
      --skip-upload)
        SKIP_UPLOAD=true
        shift
        ;;
      --wait)
        WAIT=true
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        usage >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# Pre-flight checks: stop with a message on stderr when the config file is
# missing, or when an upload is requested but the dataset directory is absent.
if ! [[ -f "${CONFIG_PATH}" ]]; then
  printf '%s\n' "Config not found: ${CONFIG_PATH}" >&2
  exit 1
fi

# The dataset directory only matters when it is going to be uploaded.
if [[ "${SKIP_UPLOAD}" == false ]]; then
  if ! [[ -d "${DATASET_DIR}" ]]; then
    printf '%s\n' \
      "Dataset not found: ${DATASET_DIR}" \
      "Run: bash examples/training/download_flower_photos.sh" >&2
    exit 1
  fi
fi
|
||||
|
||||
#######################################
# Print a command line prefixed with "+ ", then execute it.
# Arguments: the command to run, followed by its arguments
# Outputs:   the trace line on stdout, then whatever the command prints
# Returns:   the exit status of the executed command
#######################################
run() {
  printf '+ %s\n' "$*"
  "$@"
}
|
||||
|
||||
# Every qc-cli call goes through `uv run`; keep the shared prefix in one place.
qc_cli=(uv run qc-cli)

# Run `qc-cli infra status` first, then upload the dataset unless the caller
# asked to skip the upload.
run "${qc_cli[@]}" infra status --config "${CONFIG_PATH}"

case "${SKIP_UPLOAD}" in
  false)
    run "${qc_cli[@]}" upload "${DATASET_DIR}" --config "${CONFIG_PATH}"
    ;;
esac
|
||||
|
||||
# Start the training job. Its stdout is captured so the job name can be parsed
# from it. The exit status is collected explicitly: with `set -e` a bare
# assignment would abort on failure before the captured output is printed,
# hiding whatever qc-cli wrote to stdout.
train_rc=0
TRAIN_OUTPUT="$(uv run qc-cli train start --config "${CONFIG_PATH}")" || train_rc=$?
printf '%s\n' "${TRAIN_OUTPUT}"
if ((train_rc != 0)); then
  echo "qc-cli train start failed with exit code ${train_rc}." >&2
  exit "${train_rc}"
fi

# Pick the last token of the form qc-cli-<8 digits>-<6 digits> from the output.
# `|| true` is required: when grep finds nothing the pipeline fails under
# `pipefail`, and `set -e` would otherwise abort here, before the explanatory
# message below can be printed.
JOB_NAME="$(printf '%s\n' "${TRAIN_OUTPUT}" \
  | grep -Eo 'qc-cli-[0-9]{8}-[0-9]{6}' \
  | tail -n 1)" || true
if [[ -z "${JOB_NAME}" ]]; then
  echo "Could not find training job name in qc-cli output." >&2
  exit 1
fi

echo "Submitted SageMaker training job: ${JOB_NAME}"
|
||||
|
||||
# Without --wait: show the job's current status once and finish successfully.
if [[ "${WAIT}" == false ]]; then
  run uv run qc-cli train status "${JOB_NAME}" --config "${CONFIG_PATH}"
  exit 0
fi

# With --wait: query the job status every POLL_SECONDS seconds until the
# status line reports Completed (exit 0) or Failed / Stopped (exit 1).
# Because the script runs with `set -e`, a failing status command ends the
# script with that command's exit status.
while true; do
  STATUS_OUTPUT="$(uv run qc-cli train status "${JOB_NAME}" --config "${CONFIG_PATH}")"
  printf '%s\n' "${STATUS_OUTPUT}"

  # Feed grep through a here-string rather than `printf ... | grep -q`:
  # under `pipefail`, `grep -q` exiting on its first match can leave the
  # writer killed by SIGPIPE, making the pipeline report failure and the
  # match be treated as a miss.
  if grep -q 'Status:.*Completed' <<<"${STATUS_OUTPUT}"; then
    echo "Training completed successfully."
    exit 0
  fi

  if grep -q 'Status:.*Failed' <<<"${STATUS_OUTPUT}"; then
    echo "Training failed." >&2
    exit 1
  fi

  if grep -q 'Status:.*Stopped' <<<"${STATUS_OUTPUT}"; then
    echo "Training stopped." >&2
    exit 1
  fi

  sleep "${POLL_SECONDS}"
done
|
||||
Reference in New Issue
Block a user