include-metrics-from-training (#6)
Reviewed-on: #6
This commit was merged in pull request #6.
This commit is contained in:
@@ -153,6 +153,20 @@ Or pass the job name explicitly:
|
||||
qc-cli train status qc-cli-YYYYMMDD-HHMMSS
|
||||
```
|
||||
|
||||
To submit the job, wait for completion, and automatically import metrics and register the model, run:
|
||||
|
||||
```bash
|
||||
qc-cli train start --upload-metrics
|
||||
```
|
||||
|
||||
The default polling interval is 30 seconds. It can be changed with `--poll-interval <seconds>`.
|
||||
|
||||
The metrics can also be uploaded separately with:
|
||||
|
||||
```bash
|
||||
qc-cli mlflow upload-metrics
|
||||
```
|
||||
|
||||
## SageMaker Outputs
|
||||
|
||||
When the job completes, SageMaker packages the files written under `/opt/ml/model` into `model.tar.gz`.
|
||||
@@ -163,10 +177,15 @@ This example writes:
|
||||
best.pt
|
||||
model.onnx
|
||||
metrics.json
|
||||
training_metrics.json
|
||||
```
|
||||
|
||||
The archive is stored under the configured `s3.model_prefix`.
|
||||
|
||||
The `mlflow upload-metrics` command imports `training_metrics.json`, which provides per-epoch training and validation
|
||||
losses, precision, recall, mAP@0.50, mAP@0.50:0.95, and learning rates. For object detection, mAP and precision/recall
|
||||
are more meaningful than classification accuracy when assessing model quality.
|
||||
|
||||
## 6. Configure Qualcomm AI Hub
|
||||
|
||||
Authenticate with Qualcomm AI Hub:
|
||||
|
||||
@@ -12,6 +12,7 @@ from typing import Any
|
||||
|
||||
import yaml
|
||||
from sanitize_onnx import sanitize_onnx
|
||||
from training_metrics import write_training_metrics
|
||||
from ultralytics import YOLO # type: ignore[reportMissingImports]
|
||||
|
||||
|
||||
@@ -101,6 +102,7 @@ def main() -> None:
|
||||
if not trained_weights.exists():
|
||||
raise FileNotFoundError(f"Could not find trained weights in {save_dir / 'weights'}")
|
||||
|
||||
write_training_metrics(save_dir / "results.csv", model_dir / "training_metrics.json")
|
||||
copy_if_exists(trained_weights, model_dir / "best.pt")
|
||||
trained_model = YOLO(str(trained_weights))
|
||||
onnx_path = Path(trained_model.export(format="onnx", imgsz=args.imgsz))
|
||||
|
||||
82
examples/meter-detection/source/training_metrics.py
Normal file
82
examples/meter-detection/source/training_metrics.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import csv
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Maps column names found in the training results CSV to the dotted metric
# names used in the exported JSON. Columns not listed here are passed through
# a generic name normalizer by the reader instead.
METRIC_NAMES: dict[str, str] = {
    # Validation detection metrics.
    "metrics/precision(B)": "val.precision",
    "metrics/recall(B)": "val.recall",
    "metrics/mAP50(B)": "val.map50",
    "metrics/mAP50-95(B)": "val.map50_95",
    # Training losses.
    "train/box_loss": "train.box_loss",
    "train/cls_loss": "train.cls_loss",
    "train/dfl_loss": "train.dfl_loss",
    # Validation losses.
    "val/box_loss": "val.box_loss",
    "val/cls_loss": "val.cls_loss",
    "val/dfl_loss": "val.dfl_loss",
    # Timing column.
    "time": "train.elapsed_seconds",
}
|
||||
|
||||
|
||||
def write_training_metrics(results_csv: Path, destination: Path) -> None:
    """Convert a training results CSV into a JSON metrics document.

    The document contains a ``schema_version`` of 1, the per-row ``steps``
    parsed from ``results_csv``, and a ``summary`` derived from those steps.
    It is written to ``destination`` as indented UTF-8 JSON, and a
    confirmation line is printed afterwards.

    Args:
        results_csv: Path of the CSV metrics history to read.
        destination: Path of the JSON file to write.

    Raises:
        FileNotFoundError: Propagated from the reader when ``results_csv``
            is not an existing file.
    """
    history = _read_metric_steps(results_csv)
    document = {
        "schema_version": 1,
        "steps": history,
        "summary": _build_summary(history),
    }
    # Serialize fully before touching the file so a serialization error
    # cannot leave a truncated document behind.
    serialized = json.dumps(document, indent=2)
    destination.write_text(serialized, encoding="utf-8")
    print(f"Saved {destination}")
|
||||
|
||||
|
||||
def _read_metric_steps(results_csv: Path) -> list[dict[str, Any]]:
    """Parse a results CSV into a list of per-row metric steps.

    Every data row becomes ``{"step": <int>, "metrics": {<name>: <float>}}``.
    Header names are stripped of surrounding whitespace and translated
    through ``METRIC_NAMES``; columns not listed there are renamed with
    ``_normalize_metric_name``. Cells that are blank, non-numeric, or
    non-finite (NaN / infinity) are left out of ``metrics``.

    The step number comes from the ``epoch`` column. When that column is
    absent, blank, or not a usable number, the zero-based row index is used
    instead so that a single malformed cell does not abort the export.

    Args:
        results_csv: Path of the CSV metrics history to read.

    Returns:
        One step dict per data row, in file order.

    Raises:
        FileNotFoundError: If ``results_csv`` is not an existing file.
    """
    if not results_csv.is_file():
        raise FileNotFoundError(f"Could not find Ultralytics metrics history: {results_csv}")

    steps: list[dict[str, Any]] = []
    with results_csv.open(newline="", encoding="utf-8") as csv_file:
        for row_index, raw_row in enumerate(csv.DictReader(csv_file)):
            # csv.DictReader puts surplus cells of an over-long row into a
            # list under the key None, and fills missing cells of a short row
            # with None. Neither is a usable metric cell, so keep only
            # string-keyed, string-valued entries.
            row = {
                key.strip(): value
                for key, value in raw_row.items()
                if isinstance(key, str) and isinstance(value, str)
            }

            raw_epoch = row.pop("epoch", None)
            try:
                # int(float(...)) accepts both "3" and "3.0"; it raises
                # TypeError for None, ValueError for blank/non-numeric/NaN
                # text, and OverflowError for infinity.
                step = int(float(raw_epoch))
            except (TypeError, ValueError, OverflowError):
                step = row_index

            metrics: dict[str, float] = {}
            for source_name, raw_value in row.items():
                if not raw_value.strip():
                    continue
                try:
                    value = float(raw_value)
                except ValueError:
                    # Non-numeric columns are ignored rather than exported.
                    continue
                if math.isfinite(value):
                    metric_name = METRIC_NAMES.get(source_name, _normalize_metric_name(source_name))
                    metrics[metric_name] = value
            steps.append({"step": step, "metrics": metrics})
    return steps
|
||||
|
||||
|
||||
def _build_summary(steps: list[dict[str, Any]]) -> dict[str, float]:
    """Condense per-step metrics into a flat summary mapping.

    The summary records the last step (``summary.final_epoch`` plus one
    ``summary.final.<name>`` entry per metric of that step) and, when at
    least one step carries ``val.map50_95``, the step with the highest value
    of that metric (``summary.best_epoch``, ``summary.best_val.map50_95`` and,
    if present on that step, ``summary.best_val.map50``). Ties are resolved
    in favour of the earliest step.

    Args:
        steps: Step dicts shaped like ``{"step": int, "metrics": {...}}``.

    Returns:
        The summary mapping; empty when ``steps`` is empty.
    """
    if not steps:
        return {}

    last = steps[-1]
    summary: dict[str, float] = {"summary.final_epoch": float(last["step"])}
    summary.update(
        {f"summary.final.{name}": value for name, value in last["metrics"].items()}
    )

    # Only steps that actually report the ranking metric can be "best".
    ranked = [entry for entry in steps if "val.map50_95" in entry["metrics"]]
    if not ranked:
        return summary

    winner = max(ranked, key=lambda entry: entry["metrics"]["val.map50_95"])
    winner_metrics = winner["metrics"]
    summary["summary.best_epoch"] = float(winner["step"])
    summary["summary.best_val.map50_95"] = winner_metrics["val.map50_95"]
    if "val.map50" in winner_metrics:
        summary["summary.best_val.map50"] = winner_metrics["val.map50"]
    return summary
|
||||
|
||||
|
||||
def _normalize_metric_name(name: str) -> str:
    """Turn an arbitrary column name into a dotted, filesystem-safe metric name.

    Slashes become dots, every run of characters outside ``A-Z a-z 0-9 _ . -``
    collapses into a single underscore, and leading/trailing dots and
    underscores are trimmed.

    Args:
        name: The raw column name.

    Returns:
        The cleaned name, or ``"unnamed"`` when nothing is left after cleaning.
    """
    dotted = ".".join(name.split("/"))
    cleaned = re.sub(r"[^A-Za-z0-9_.-]+", "_", dotted).strip("._")
    if cleaned:
        return cleaned
    return "unnamed"
|
||||
Reference in New Issue
Block a user