#!/usr/bin/env bash
#
# Download the TensorFlow flower_photos dataset and lay it out for SageMaker.
#
# Usage:
#   <this script> [DEST_DIR]
#
# Arguments:
#   DEST_DIR  Directory that receives the archive, the extracted tree and the
#             prepared dataset (default: examples/training/data).
#
# Files and directories written under DEST_DIR:
#   flower_photos.tgz          the downloaded archive
#   flower_photos/             where the script expects the archive to unpack
#   flower_photos_sagemaker/   one sub-directory per entry of CLASS_NAMES
#
# Exit status:
#   0  the dataset was prepared, or flower_photos_sagemaker/ already existed
#      (in which case nothing is downloaded)
#   non-zero  neither curl nor wget is installed, or any step failed
set -euo pipefail

readonly DATASET_URL="https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
readonly DEST_DIR="${1:-examples/training/data}"
readonly ARCHIVE_PATH="${DEST_DIR}/flower_photos.tgz"
readonly RAW_DATASET_DIR="${DEST_DIR}/flower_photos"
readonly DATASET_DIR="${DEST_DIR}/flower_photos_sagemaker"
# The dataset is assembled here and renamed to DATASET_DIR only once complete,
# so a failed or interrupted run never leaves a half-filled DATASET_DIR that a
# later run would mistake for a finished dataset.
readonly STAGING_DIR="${DATASET_DIR}.partial"
readonly -a CLASS_NAMES=("daisy" "dandelion" "roses" "sunflowers" "tulips")

# Print the arguments to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the staging directory, if any. Installed as the EXIT trap.
cleanup_staging() {
  rm -rf -- "${STAGING_DIR:?}"
}

# Fetch DATASET_URL into ARCHIVE_PATH with curl, falling back to wget.
download_archive() {
  if command -v curl >/dev/null 2>&1; then
    # --fail turns an HTTP error into a non-zero exit instead of saving the
    # server's error page as the archive.
    curl --fail -L "${DATASET_URL}" -o "${ARCHIVE_PATH}"
  elif command -v wget >/dev/null 2>&1; then
    wget -O "${ARCHIVE_PATH}" "${DATASET_URL}"
  else
    die "Either curl or wget is required."
  fi
}

# Copy every class directory from RAW_DATASET_DIR into a fresh staging
# directory, then move the staging directory into place as DATASET_DIR.
build_dataset() {
  local class_name

  # Start from an empty staging directory; leftovers from an earlier aborted
  # run would otherwise make cp -R nest its copies inside existing directories.
  rm -rf -- "${STAGING_DIR:?}"
  mkdir -p -- "${STAGING_DIR}"

  for class_name in "${CLASS_NAMES[@]}"; do
    [[ -d "${RAW_DATASET_DIR}/${class_name}" ]] \
      || die "Missing class directory: ${RAW_DATASET_DIR}/${class_name}"
    cp -R -- "${RAW_DATASET_DIR}/${class_name}" "${STAGING_DIR}/${class_name}"
  done

  mv -- "${STAGING_DIR}" "${DATASET_DIR}"
}

main() {
  mkdir -p -- "${DEST_DIR}"

  if [[ -d "${DATASET_DIR}" ]]; then
    echo "Dataset already exists: ${DATASET_DIR}"
    echo "Use this path with run_training.py:"
    echo " ${DATASET_DIR}"
    exit 0
  fi

  trap cleanup_staging EXIT

  echo "Downloading TensorFlow flower_photos dataset..."
  download_archive

  echo "Extracting dataset..."
  tar -xzf "${ARCHIVE_PATH}" -C "${DEST_DIR}"

  echo "Preparing SageMaker directory layout..."
  build_dataset

  echo "Dataset ready: ${DATASET_DIR}"
  find "${DATASET_DIR}" -mindepth 1 -maxdepth 1 -type d -print | sort
}

main "$@"