From 57a8a0a9c4da98ce0e72686d4a863bcb97d33060 Mon Sep 17 00:00:00 2001 From: slalom Date: Fri, 29 May 2026 15:40:38 -0400 Subject: [PATCH] rename and future steps --- README.md | 42 +++++++++++++++++++++++++++++++++++++++++- src/commands/train.py | 4 ++-- src/state.py | 4 ++-- src/tracking/mlflow.py | 10 +++++++--- 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1c31319..3a97640 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ mlflow: tracking_server_name: your-tracking-server-name ``` -When MLflow is enabled, `train start` creates an MLflow run for the SageMaker job. `train status` finalizes that run once the job reaches a terminal state and registers completed model artifacts as pre-release model versions using the `prerelease-latest` MLflow alias. +When MLflow is enabled, `train start` creates an MLflow run for the SageMaker job. `train status` finalizes that run once the job reaches a terminal state and registers completed model artifacts as experiment model versions using the `experiment-latest` MLflow alias. An experiment version is an immutable trained-source artifact; it records that training produced a model, not that the model is better than earlier versions or ready for release. To open the managed SageMaker MLflow UI, request a fresh presigned URL: @@ -155,6 +155,46 @@ qc-cli train list --limit 3 Show a custom number of recent jobs The expected output artifact is SageMaker’s `model.tar.gz`, normally containing the trained model file your container writes to `/opt/ml/model`. +## Model lifecycle + +The CLI uses neutral experiment naming for trained artifacts and reserves release terminology for an explicit promotion step. + +Current behavior: + +1. `qc-cli train start` submits a SageMaker training job. +2. `qc-cli train status` finalizes the MLflow run after the job reaches a terminal state. +3. 
If the job completed and `mlflow.register_trained_models` is enabled, the SageMaker `model.tar.gz` is registered as a new MLflow model version with: + - `qc_cli.stage=experiment` + - `qc_cli.artifact_kind=trained_source` + - `qc_cli.source=sagemaker` +4. The MLflow alias `experiment-latest` points at the most recently registered experiment version. + +Planned AI Hub extension: + +1. An AI Hub compile or quantize job will create deployable derived artifacts from a trained-source experiment. +2. Derived artifacts will keep lineage back to the source experiment version instead of replacing it. +3. Release aliases such as `v1` or `production` will point at the selected deployable artifact. + +Example future metadata: + +```text +qc-cli-model version 12 +qc_cli.stage=experiment +qc_cli.artifact_kind=trained_source +qc_cli.source=sagemaker + +qc-cli-model-aihub version 3 +qc_cli.stage=ai_hub_compiled +qc_cli.artifact_kind=deployable +qc_cli.parent_registered_model_name=qc-cli-model +qc_cli.parent_model_version=12 +qc_cli.runtime=tflite +qc_cli.quantization=int8 +qc_cli.target_device=Samsung Galaxy S25 +``` + +In that flow, `experiment-latest` remains a training convenience alias. Release selection is a separate promotion decision based on the derived artifact, not on the experiment name. 
+ ## AWS permissions required The IAM user or role running the CLI needs: diff --git a/src/commands/train.py b/src/commands/train.py index 12b82b5..5404d1a 100644 --- a/src/commands/train.py +++ b/src/commands/train.py @@ -148,8 +148,8 @@ def status( updates["registered_model_version"] = version st.update_training_job(job_name, **updates) if version: - st.set_latest_prerelease_model_version(version) - CONSOLE.print(f"MLflow model version: [cyan]{version}[/cyan] ([cyan]prerelease-latest[/cyan])") + st.set_latest_experiment_model_version(version) + CONSOLE.print(f"MLflow model version: [cyan]{version}[/cyan] ([cyan]experiment-latest[/cyan])") if run_id and cfg.mlflow.mode is not MlflowMode.disabled: CONSOLE.print("Open MLflow: [cyan]qc-cli infra mlflow-url[/cyan]") diff --git a/src/state.py b/src/state.py index bd63662..c9a643f 100644 --- a/src/state.py +++ b/src/state.py @@ -48,8 +48,8 @@ class CliStateStore: state["training_jobs"] = jobs self._write(state) - def set_latest_prerelease_model_version(self, version: str) -> None: - self.update(latest_prerelease_model_version=version) + def set_latest_experiment_model_version(self, version: str) -> None: + self.update(latest_experiment_model_version=version) def _write(self, state: dict[str, Any]) -> None: with open(self.path, "w") as f: diff --git a/src/tracking/mlflow.py b/src/tracking/mlflow.py index 9275d9a..0e8f5d0 100644 --- a/src/tracking/mlflow.py +++ b/src/tracking/mlflow.py @@ -78,7 +78,9 @@ class MlflowTracker: self._log_params({f"hyperparameters.{key}": value for key, value in training_job.hyperparameters.items()}) mlflow.set_tags( { - "qc_cli.stage": "prerelease", + "qc_cli.stage": "experiment", + "qc_cli.artifact_kind": "trained_source", + "qc_cli.source": "sagemaker", "qc_cli.command": "train start", "sagemaker.job_name": training_job.job_name, } @@ -117,12 +119,14 @@ class MlflowTracker: source=training_job_status.model_artifacts, run_id=run_id, tags={ - "qc_cli.stage": "prerelease", + "qc_cli.stage": 
"experiment", + "qc_cli.artifact_kind": "trained_source", + "qc_cli.source": "sagemaker", "sagemaker.job_name": training_job_status.name, }, ) version_number = str(version.version) - client.set_registered_model_alias(self.registered_model_name, "prerelease-latest", version_number) + client.set_registered_model_alias(self.registered_model_name, "experiment-latest", version_number) mlflow.set_tag("qc_cli.registered_model_name", self.registered_model_name) mlflow.set_tag("qc_cli.registered_model_version", version_number) return version_number