make sure resources are set up in isolated namespaces #1

Merged
slalom merged 1 commits from cdk-changes into main 2026-05-27 12:51:27 +00:00
11 changed files with 184 additions and 36 deletions

2
.gitignore vendored
View File

@@ -218,7 +218,7 @@ __marimo__/
.streamlit/secrets.toml .streamlit/secrets.toml
.venv/ .venv/
config.yaml config*.yaml
cdk.out/ cdk.out/
.qc-cli*.json .qc-cli*.json
examples/*/data/ examples/*/data/

View File

@@ -30,7 +30,7 @@ qc-cli --help
# 1. Create config.yaml in the current directory # 1. Create config.yaml in the current directory
qc-cli init qc-cli init
# 2. Edit config.yaml — at minimum set s3.bucket and sagemaker.training.image_uri # 2. Edit config.yaml — at minimum set sagemaker.training.image_uri
# 3. Provision AWS infrastructure (S3 bucket + SageMaker IAM role). # 3. Provision AWS infrastructure (S3 bucket + SageMaker IAM role).
# This is the step that requires the AWS CDK CLI. # This is the step that requires the AWS CDK CLI.
@@ -47,15 +47,17 @@ qc-cli train status
`qc-cli init` writes a `config.yaml` in the current directory. The fields you must fill in before using the tool: `qc-cli init` writes a `config.yaml` in the current directory. The fields you must fill in before using the tool:
```yaml ```yaml
infra:
stack_name: qc-cli-mlops-1a2b3c4d5e6f
aws: aws:
region: us-east-1 region: us-east-1
profile: default # AWS CLI profile name profile: default # AWS CLI profile name
s3: s3:
bucket: your-unique-bucket-name bucket: qc-cli-mlops-1a2b3c4d5e6f-data
sagemaker: sagemaker:
role_name: qc-cli-sagemaker-role
training: training:
image_uri: "" # ECR URI for your training container image_uri: "" # ECR URI for your training container
instance_type: ml.m5.xlarge instance_type: ml.m5.xlarge
@@ -65,6 +67,10 @@ sagemaker:
hyperparameters: {} hyperparameters: {}
``` ```
`qc-cli init` generates the `infra.stack_name` and `s3.bucket` namespace once and writes it to `config.yaml`. Keep these values stable for a deployment; changing them points the CLI at different infrastructure.
The CLI isolates both application resources and CDK bootstrap resources. The application CloudFormation stack uses `infra.stack_name`, the S3 bucket uses the same generated namespace because bucket names are globally unique, and the SageMaker IAM role uses a CloudFormation-generated physical name. CDK bootstrap resources are derived internally from `infra.stack_name`, including a bootstrap stack named `<stack_name>-bootstrap` and a matching non-default CDK asset bucket qualifier. `qc-cli infra destroy` removes the application stack but leaves the CDK bootstrap stack in place; the command prints the retained bootstrap stack name.
`hyperparameters` is a flat map of values passed to the training container. Valid keys depend on the selected training image and entry point. `hyperparameters` is a flat map of values passed to the training container. Valid keys depend on the selected training image and entry point.
To provision an MLflow tracking server, set: To provision an MLflow tracking server, set:
@@ -105,6 +111,12 @@ qc-cli infra destroy --yes Destroy stack without confirmation
qc-cli infra destroy --delete-bucket-data Destroy stack and delete S3 data qc-cli infra destroy --delete-bucket-data Destroy stack and delete S3 data
``` ```
`--cloudformation-execution-policy` is a one-time CDK bootstrap option, not a `config.yaml` setting. Pass it on `infra setup` when you need the CDK bootstrap CloudFormation execution role to use a policy other than the default `AdministratorAccess`:
```bash
qc-cli infra setup --cloudformation-execution-policy arn:aws:iam::aws:policy/PowerUserAccess
```
### `upload` ### `upload`
``` ```

4
app.py
View File

@@ -8,17 +8,19 @@ from src.infra.stack import QCStack
app = cdk.App() app = cdk.App()
config_path = app.node.try_get_context("config") or "config.yaml" config_path = app.node.try_get_context("config") or "config.yaml"
stack_name = app.node.try_get_context("stack_name") or "MLOpsStack"
account_id = app.node.try_get_context("account_id") or os.getenv("CDK_DEFAULT_ACCOUNT") account_id = app.node.try_get_context("account_id") or os.getenv("CDK_DEFAULT_ACCOUNT")
delete_bucket_data = str(app.node.try_get_context("delete_bucket_data") or "false").lower() == "true" delete_bucket_data = str(app.node.try_get_context("delete_bucket_data") or "false").lower() == "true"
cfg = load_config(config_path) cfg = load_config(config_path)
stack_name = app.node.try_get_context("stack_name") or cfg.infra.stack_name
bootstrap_qualifier = app.node.try_get_context("bootstrap_qualifier") or cfg.infra.effective_bootstrap_qualifier
QCStack( QCStack(
app, app,
stack_name, stack_name,
config=cfg, config=cfg,
delete_bucket_data=delete_bucket_data, delete_bucket_data=delete_bucket_data,
synthesizer=cdk.DefaultStackSynthesizer(qualifier=bootstrap_qualifier),
env=cdk.Environment( env=cdk.Environment(
account=account_id, account=account_id,
region=cfg.aws.region, region=cfg.aws.region,

View File

@@ -13,7 +13,6 @@ s3:
bucket: your-bucket-name bucket: your-bucket-name
sagemaker: sagemaker:
role_name: <role-name>
training: training:
image_uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.6-cpu-py312-ubuntu22.04-sagemaker-v1 image_uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.6-cpu-py312-ubuntu22.04-sagemaker-v1
instance_type: ml.m4.xlarge instance_type: ml.m4.xlarge

View File

@@ -3,13 +3,11 @@ from typing import Any
import boto3 import boto3
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
from src.infra.provisioning import STACK_NAME
def stack_status(region: str, profile: str, stack_name: str) -> dict[str, Any] | None:
def stack_status(region: str, profile: str) -> dict[str, Any] | None:
client = boto3.Session(profile_name=profile, region_name=region).client("cloudformation") client = boto3.Session(profile_name=profile, region_name=region).client("cloudformation")
try: try:
stack = client.describe_stacks(StackName=STACK_NAME)["Stacks"][0] stack = client.describe_stacks(StackName=stack_name)["Stacks"][0]
except ClientError as e: except ClientError as e:
message = e.response.get("Error", {}).get("Message", "") message = e.response.get("Error", {}).get("Message", "")
if "does not exist" in message: if "does not exist" in message:

View File

@@ -51,6 +51,8 @@ def setup(
profile=cfg.aws.profile, profile=cfg.aws.profile,
account_id=account_id, account_id=account_id,
region=cfg.aws.region, region=cfg.aws.region,
bootstrap_qualifier=cfg.infra.effective_bootstrap_qualifier,
toolkit_stack_name=cfg.infra.effective_toolkit_stack_name,
cloudformation_execution_policy=cloudformation_execution_policy, cloudformation_execution_policy=cloudformation_execution_policy,
) )
with CONSOLE.status("Running cdk deploy..."): with CONSOLE.status("Running cdk deploy..."):
@@ -58,6 +60,9 @@ def setup(
profile=cfg.aws.profile, profile=cfg.aws.profile,
account_id=account_id, account_id=account_id,
region=cfg.aws.region, region=cfg.aws.region,
stack_name=cfg.infra.stack_name,
bootstrap_qualifier=cfg.infra.effective_bootstrap_qualifier,
toolkit_stack_name=cfg.infra.effective_toolkit_stack_name,
config_path=config, config_path=config,
config_dir=str(Path(config).parent), config_dir=str(Path(config).parent),
config_snapshot=cfg.model_dump(mode="json"), config_snapshot=cfg.model_dump(mode="json"),
@@ -82,7 +87,7 @@ def setup(
def status(config: str = CONFIG_OPT) -> None: def status(config: str = CONFIG_OPT) -> None:
"""Show current infrastructure status.""" """Show current infrastructure status."""
cfg = load_cfg(config) cfg = load_cfg(config)
stack = cloudformation.stack_status(cfg.aws.region, cfg.aws.profile) stack = cloudformation.stack_status(cfg.aws.region, cfg.aws.profile, cfg.infra.stack_name)
table = Table(title="Infrastructure Status") table = Table(title="Infrastructure Status")
table.add_column("Resource", style="cyan") table.add_column("Resource", style="cyan")
@@ -91,7 +96,7 @@ def status(config: str = CONFIG_OPT) -> None:
table.add_column("ARN / URI") table.add_column("ARN / URI")
if not stack: if not stack:
table.add_row("CDK Stack", provisioning.STACK_NAME, "[red]missing[/red]", "-") table.add_row("CDK Stack", cfg.infra.stack_name, "[red]missing[/red]", "-")
table.add_row("S3 Bucket", cfg.s3.bucket, "[red]unknown[/red]", "-") table.add_row("S3 Bucket", cfg.s3.bucket, "[red]unknown[/red]", "-")
table.add_row("IAM Role", cfg.sagemaker.role_name, "[red]unknown[/red]", "-") table.add_row("IAM Role", cfg.sagemaker.role_name, "[red]unknown[/red]", "-")
if cfg.mlflow.mode is not MlflowMode.disabled: if cfg.mlflow.mode is not MlflowMode.disabled:
@@ -114,7 +119,7 @@ def status(config: str = CONFIG_OPT) -> None:
) )
table.add_row( table.add_row(
"IAM Role", "IAM Role",
cfg.sagemaker.role_name, _role_name(cfg.sagemaker.role_name, outputs.get("SageMakerRoleArn", "")),
"[green]managed[/green]", "[green]managed[/green]",
outputs.get("SageMakerRoleArn", "-"), outputs.get("SageMakerRoleArn", "-"),
) )
@@ -156,10 +161,13 @@ def destroy(
) -> None: ) -> None:
"""Destroy the CDK stack.""" """Destroy the CDK stack."""
cfg = _destroy_config(config) cfg = _destroy_config(config)
stack_name = _destroy_stack_name(config, cfg)
bootstrap_qualifier = _destroy_bootstrap_qualifier(config, cfg)
toolkit_stack_name = _destroy_toolkit_stack_name(config, cfg)
if not yes and not delete_bucket_data: if not yes and not delete_bucket_data:
typer.confirm( typer.confirm(
f"Destroy CDK stack '{provisioning.STACK_NAME}' while retaining S3 bucket data?", f"Destroy CDK stack '{stack_name}' while retaining S3 bucket data?",
abort=True, abort=True,
) )
@@ -172,13 +180,17 @@ def destroy(
provisioning.destroy( provisioning.destroy(
profile=cfg.aws.profile, profile=cfg.aws.profile,
account_id=account_id, account_id=account_id,
stack_name=stack_name,
bootstrap_qualifier=bootstrap_qualifier,
toolkit_stack_name=toolkit_stack_name,
config_path=str(snapshot_path), config_path=str(snapshot_path),
delete_bucket_data=delete_bucket_data, delete_bucket_data=delete_bucket_data,
) )
except RuntimeError as e: except RuntimeError as e:
CONSOLE.print(f"[red]{e}[/red]") CONSOLE.print(f"[red]{e}[/red]")
raise typer.Exit(1) raise typer.Exit(1)
CONSOLE.print(f"[green]✓[/green] Destroyed stack: {provisioning.STACK_NAME}") CONSOLE.print(f"[green]✓[/green] Destroyed stack: {stack_name}")
CONSOLE.print(f"[yellow]CDK bootstrap stack retained: {toolkit_stack_name}[/yellow]")
def _destroy_config(config_path: str) -> Config: def _destroy_config(config_path: str) -> Config:
@@ -190,6 +202,13 @@ def _destroy_config(config_path: str) -> Config:
return load_cfg(config_path) return load_cfg(config_path)
def _role_name(configured_name: str, role_arn: str) -> str:
if configured_name:
return configured_name
if role_arn:
return role_arn.rsplit("/", 1)[-1]
return "-"
def _destroy_account_id(config_path: str, cfg: Config) -> str: def _destroy_account_id(config_path: str, cfg: Config) -> str:
config_dir = str(Path(config_path).parent) config_dir = str(Path(config_path).parent)
state = read_infra_state(config_dir) state = read_infra_state(config_dir)
@@ -197,3 +216,30 @@ def _destroy_account_id(config_path: str, cfg: Config) -> str:
if account_id: if account_id:
return str(account_id) return str(account_id)
return identity.account_id(cfg.aws.region, cfg.aws.profile) return identity.account_id(cfg.aws.region, cfg.aws.profile)
def _destroy_stack_name(config_path: str, cfg: Config) -> str:
config_dir = str(Path(config_path).parent)
state = read_infra_state(config_dir)
stack_name = state.get("stack_name")
if stack_name:
return str(stack_name)
return cfg.infra.stack_name
def _destroy_bootstrap_qualifier(config_path: str, cfg: Config) -> str:
config_dir = str(Path(config_path).parent)
state = read_infra_state(config_dir)
bootstrap_qualifier = state.get("bootstrap_qualifier")
if bootstrap_qualifier:
return str(bootstrap_qualifier)
return cfg.infra.effective_bootstrap_qualifier
def _destroy_toolkit_stack_name(config_path: str, cfg: Config) -> str:
config_dir = str(Path(config_path).parent)
state = read_infra_state(config_dir)
toolkit_stack_name = state.get("toolkit_stack_name")
if toolkit_stack_name:
return str(toolkit_stack_name)
return cfg.infra.effective_toolkit_stack_name

View File

@@ -1,4 +1,5 @@
from datetime import datetime from datetime import datetime
from pathlib import Path
import typer import typer
from rich.table import Table from rich.table import Table
@@ -7,6 +8,8 @@ from src import state as state_ops
from src.aws import iam from src.aws import iam
from src.aws import sagemaker as sm_ops from src.aws import sagemaker as sm_ops
from src.commands.utils import CONFIG_OPT, CONSOLE, load_cfg from src.commands.utils import CONFIG_OPT, CONSOLE, load_cfg
from src.config import Config
from src.infra.state import read_infra_state
app = typer.Typer(help="Manage SageMaker training jobs") app = typer.Typer(help="Manage SageMaker training jobs")
@@ -20,10 +23,22 @@ _STATUS_COLOR = {
def _config_dir(config_path: str) -> str: def _config_dir(config_path: str) -> str:
from pathlib import Path
return str(Path(config_path).parent) return str(Path(config_path).parent)
def _sagemaker_role_arn(config_path: str, cfg: Config) -> str:
state = read_infra_state(_config_dir(config_path))
role_arn = state.get("outputs", {}).get("SageMakerRoleArn")
if role_arn:
return str(role_arn)
if cfg.sagemaker.role_name:
role_arn = iam.get_role_arn(cfg.aws.profile, cfg.sagemaker.role_name)
if role_arn:
return role_arn
raise RuntimeError(f"IAM role '{cfg.sagemaker.role_name}' not found. Run 'qc-cli infra setup' first.")
raise RuntimeError("SageMaker role not found in infra state. Run 'qc-cli infra setup' first.")
@app.command() @app.command()
def start(config: str = CONFIG_OPT) -> None: def start(config: str = CONFIG_OPT) -> None:
"""Submit a SageMaker training job.""" """Submit a SageMaker training job."""
@@ -37,9 +52,10 @@ def start(config: str = CONFIG_OPT) -> None:
) )
raise typer.Exit(1) raise typer.Exit(1)
role_arn = iam.get_role_arn(cfg.aws.profile, cfg.sagemaker.role_name) try:
if not role_arn: role_arn = _sagemaker_role_arn(config, cfg)
CONSOLE.print(f"[red]IAM role '{cfg.sagemaker.role_name}' not found. Run 'qc-cli infra setup' first.[/red]") except RuntimeError as e:
CONSOLE.print(f"[red]{e}[/red]")
raise typer.Exit(1) raise typer.Exit(1)
job_name = f"qc-cli-{datetime.now().strftime('%Y%m%d-%H%M%S')}" job_name = f"qc-cli-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

View File

@@ -1,3 +1,4 @@
import re
from enum import Enum from enum import Enum
from typing import Any, Literal, TypedDict from typing import Any, Literal, TypedDict
@@ -32,6 +33,33 @@ class AwsConfig(BaseModel):
return {"profile_name": self.profile, "region_name": self.region} return {"profile_name": self.profile, "region_name": self.region}
DEFAULT_BOOTSTRAP_QUALIFIER = "hnb659fds"
GENERATED_STACK_PREFIX = "qc-cli-mlops-"
class InfraConfig(BaseModel):
stack_name: str
@property
def effective_bootstrap_qualifier(self) -> str:
sanitized = re.sub(r"[^a-z0-9]", "", self.stack_name.lower())
if not sanitized:
return DEFAULT_BOOTSTRAP_QUALIFIER
if self.stack_name.startswith(GENERATED_STACK_PREFIX):
suffix = re.sub(r"[^a-z0-9]", "", self.stack_name.removeprefix(GENERATED_STACK_PREFIX).lower())
if suffix:
return f"q{suffix}"[:10]
return f"q{sanitized}"[:10]
@property
def effective_toolkit_stack_name(self) -> str:
if self.stack_name.startswith(GENERATED_STACK_PREFIX):
suffix = re.sub(r"[^A-Za-z0-9-]", "", self.stack_name.removeprefix(GENERATED_STACK_PREFIX))
if suffix:
return f"{self.stack_name}-bootstrap"
return f"{self.stack_name}-bootstrap"
class S3Config(BaseModel): class S3Config(BaseModel):
bucket: str = "my-qc-mlops-bucket" bucket: str = "my-qc-mlops-bucket"
data_prefix: str = "data/" data_prefix: str = "data/"
@@ -48,7 +76,7 @@ class TrainingConfig(BaseModel):
class SageMakerConfig(BaseModel): class SageMakerConfig(BaseModel):
role_name: str = "qc-cli-sagemaker-role" role_name: str = ""
training: TrainingConfig = Field(default_factory=TrainingConfig) training: TrainingConfig = Field(default_factory=TrainingConfig)
@@ -69,6 +97,7 @@ class MlflowConfig(BaseModel):
class Config(BaseModel): class Config(BaseModel):
infra: InfraConfig
aws: AwsConfig = Field(default_factory=AwsConfig) aws: AwsConfig = Field(default_factory=AwsConfig)
s3: S3Config = Field(default_factory=S3Config) s3: S3Config = Field(default_factory=S3Config)
sagemaker: SageMakerConfig = Field(default_factory=SageMakerConfig) sagemaker: SageMakerConfig = Field(default_factory=SageMakerConfig)

View File

@@ -5,17 +5,27 @@ from typing import Any
from src.infra.state import state_path, write_infra_state from src.infra.state import state_path, write_infra_state
STACK_NAME = "MLOpsStack"
def bootstrap( def bootstrap(
*, *,
profile: str, profile: str,
account_id: str, account_id: str,
region: str, region: str,
bootstrap_qualifier: str,
toolkit_stack_name: str,
cloudformation_execution_policy: str | None = None, cloudformation_execution_policy: str | None = None,
) -> None: ) -> None:
cmd = ["cdk", "bootstrap", f"aws://{account_id}/{region}", "--profile", profile] cmd = [
"cdk",
"bootstrap",
f"aws://{account_id}/{region}",
"--profile",
profile,
"--qualifier",
bootstrap_qualifier,
"--toolkit-stack-name",
toolkit_stack_name,
]
if cloudformation_execution_policy: if cloudformation_execution_policy:
cmd.extend(["--cloudformation-execution-policies", cloudformation_execution_policy]) cmd.extend(["--cloudformation-execution-policies", cloudformation_execution_policy])
_run(cmd) _run(cmd)
@@ -26,6 +36,9 @@ def deploy(
profile: str, profile: str,
account_id: str, account_id: str,
region: str, region: str,
stack_name: str,
bootstrap_qualifier: str,
toolkit_stack_name: str,
config_path: str, config_path: str,
config_dir: str, config_dir: str,
config_snapshot: dict[str, Any], config_snapshot: dict[str, Any],
@@ -35,19 +48,24 @@ def deploy(
"deploy", "deploy",
profile=profile, profile=profile,
account_id=account_id, account_id=account_id,
stack_name=stack_name,
bootstrap_qualifier=bootstrap_qualifier,
toolkit_stack_name=toolkit_stack_name,
config_path=config_path, config_path=config_path,
delete_bucket_data=False, delete_bucket_data=False,
) + ["--require-approval", "never", "--outputs-file", str(outputs_file)] ) + ["--require-approval", "never", "--outputs-file", str(outputs_file)]
_run(cmd) _run(cmd)
outputs = _read_outputs(outputs_file) outputs = _read_outputs(outputs_file, stack_name)
state = { state = {
"stack_name": STACK_NAME, "stack_name": stack_name,
"aws": { "aws": {
"account_id": account_id, "account_id": account_id,
"region": region, "region": region,
"profile": profile, "profile": profile,
}, },
"bootstrap_qualifier": bootstrap_qualifier,
"toolkit_stack_name": toolkit_stack_name,
"config": config_snapshot, "config": config_snapshot,
"outputs": outputs, "outputs": outputs,
} }
@@ -59,6 +77,9 @@ def destroy(
*, *,
profile: str, profile: str,
account_id: str, account_id: str,
stack_name: str,
bootstrap_qualifier: str,
toolkit_stack_name: str,
config_path: str, config_path: str,
delete_bucket_data: bool, delete_bucket_data: bool,
) -> None: ) -> None:
@@ -67,6 +88,9 @@ def destroy(
"deploy", "deploy",
profile=profile, profile=profile,
account_id=account_id, account_id=account_id,
stack_name=stack_name,
bootstrap_qualifier=bootstrap_qualifier,
toolkit_stack_name=toolkit_stack_name,
config_path=config_path, config_path=config_path,
delete_bucket_data=True, delete_bucket_data=True,
) + ["--require-approval", "never"] ) + ["--require-approval", "never"]
@@ -76,6 +100,9 @@ def destroy(
"destroy", "destroy",
profile=profile, profile=profile,
account_id=account_id, account_id=account_id,
stack_name=stack_name,
bootstrap_qualifier=bootstrap_qualifier,
toolkit_stack_name=toolkit_stack_name,
config_path=config_path, config_path=config_path,
delete_bucket_data=delete_bucket_data, delete_bucket_data=delete_bucket_data,
) + ["--force"] ) + ["--force"]
@@ -87,26 +114,35 @@ def _cdk_cmd(
*, *,
profile: str, profile: str,
account_id: str, account_id: str,
stack_name: str,
bootstrap_qualifier: str,
toolkit_stack_name: str,
config_path: str, config_path: str,
delete_bucket_data: bool, delete_bucket_data: bool,
) -> list[str]: ) -> list[str]:
cmd = [ cmd = [
"cdk", "cdk",
action, action,
STACK_NAME, stack_name,
"--app", "--app",
"python app.py", "python app.py",
"--profile", "--profile",
profile, profile,
]
if action == "deploy":
cmd.extend(["--toolkit-stack-name", toolkit_stack_name])
cmd.extend([
"-c", "-c",
f"account_id={account_id}", f"account_id={account_id}",
"-c", "-c",
f"config={config_path}", f"config={config_path}",
"-c", "-c",
f"stack_name={STACK_NAME}", f"stack_name={stack_name}",
"-c",
f"bootstrap_qualifier={bootstrap_qualifier}",
"-c", "-c",
f"delete_bucket_data={str(delete_bucket_data).lower()}", f"delete_bucket_data={str(delete_bucket_data).lower()}",
] ])
return cmd return cmd
@@ -119,9 +155,9 @@ def _run(cmd: list[str]) -> None:
raise RuntimeError(f"CDK command failed with exit code {e.returncode}.") from e raise RuntimeError(f"CDK command failed with exit code {e.returncode}.") from e
def _read_outputs(path: Path) -> dict[str, str]: def _read_outputs(path: Path, stack_name: str) -> dict[str, str]:
if not path.exists(): if not path.exists():
return {} return {}
with open(path) as f: with open(path) as f:
data = json.load(f) data = json.load(f)
return data.get(STACK_NAME, {}) return data.get(stack_name, {})

View File

@@ -34,7 +34,7 @@ class QCStack(Stack):
role = iam.CfnRole( role = iam.CfnRole(
self, self,
"SageMakerRole", "SageMakerRole",
role_name=config.sagemaker.role_name, role_name=config.sagemaker.role_name or None,
assume_role_policy_document=self._sagemaker_trust_policy(), assume_role_policy_document=self._sagemaker_trust_policy(),
managed_policy_arns=[ managed_policy_arns=[
f"arn:{self.partition}:iam::aws:policy/AmazonSageMakerFullAccess", f"arn:{self.partition}:iam::aws:policy/AmazonSageMakerFullAccess",

View File

@@ -1,3 +1,4 @@
import secrets
from pathlib import Path from pathlib import Path
import typer import typer
@@ -8,7 +9,7 @@ from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn
from src.aws import s3 as s3_ops from src.aws import s3 as s3_ops
from src.commands import infra, train from src.commands import infra, train
from src.commands.utils import CONFIG_OPT, load_cfg from src.commands.utils import CONFIG_OPT, load_cfg
from src.config import Config from src.config import GENERATED_STACK_PREFIX, Config, InfraConfig, S3Config
app = typer.Typer( app = typer.Typer(
help="qc-cli: End-to-end model managment for Qualcomm AI Hub.", help="qc-cli: End-to-end model managment for Qualcomm AI Hub.",
@@ -31,18 +32,27 @@ def init(
console.print(f"[yellow]{dest} already exists.[/yellow] Use --force to overwrite.") console.print(f"[yellow]{dest} already exists.[/yellow] Use --force to overwrite.")
raise typer.Exit(1) raise typer.Exit(1)
config = Config() config = _new_isolated_config()
dest.parent.mkdir(parents=True, exist_ok=True) dest.parent.mkdir(parents=True, exist_ok=True)
config_data = config.model_dump(mode="json")
config_data["sagemaker"].pop("role_name", None)
with open(dest, "w") as f: with open(dest, "w") as f:
yaml.safe_dump(config.model_dump(mode="json"), f, sort_keys=False) yaml.safe_dump(config_data, f, sort_keys=False)
console.print(f"[green]✓[/green] Config written to [bold]{dest}[/bold]") console.print(f"[green]✓[/green] Config written to [bold]{dest}[/bold]")
console.print( console.print(
"Edit it (especially [cyan]s3.bucket[/cyan] and [cyan]sagemaker.training.image_uri[/cyan]) " "Edit [cyan]sagemaker.training.image_uri[/cyan] before running training commands."
"before running other commands."
) )
def _new_isolated_config() -> Config:
suffix = secrets.token_hex(6)
namespace = f"{GENERATED_STACK_PREFIX}{suffix}"
config = Config(infra=InfraConfig(stack_name=namespace))
config.s3 = S3Config(bucket=f"{namespace}-data")
return config
@app.command() @app.command()
def upload( def upload(
path: Path = typer.Argument(..., help="Local file or directory to upload"), path: Path = typer.Argument(..., help="Local file or directory to upload"),