include-metrics-from-training (#6)

Reviewed-on: #6
2026-06-12 18:23:25 +00:00
parent 522ddc74e2
commit a1ffbb77c5
13 changed files with 785 additions and 116 deletions
--- a/examples/meter-detection/README.md
+++ b/examples/meter-detection/README.md
@@ -153,6 +153,20 @@ Or pass the job name explicitly:
 qc-cli train status qc-cli-YYYYMMDD-HHMMSS
 ```

+To submit the job, wait for completion, and automatically import metrics and register the model, run:
+
+```bash
+qc-cli train start --upload-metrics
+```
+
+The default polling interval is 30 seconds. It can be changed with `--poll-interval <seconds>`.
+
+The metrics can also be submitted using:
+
+```bash
+qc-cli mlflow upload-metrics
+```
+
 ## SageMaker Outputs

 When the job completes, SageMaker packages the files written under `/opt/ml/model` into `model.tar.gz`.
@@ -163,10 +177,15 @@ This example writes:
 best.pt
 model.onnx
 metrics.json
+training_metrics.json
 ```

 The archive is stored under the configured `s3.model_prefix`.

+The `mlflow upload-metrics` command imports `training_metrics.json`, which provides per-epoch training and validation
+losses, precision, recall, mAP@0.50, mAP@0.50:0.95, and learning rates. For object detection, mAP and precision/recall
+are more meaningful than classification accuracy when assessing model quality.
+
 ## 6. Configure Qualcomm AI Hub

 Authenticate with Qualcomm AI Hub:
--- a/examples/meter-detection/source/train.py
+++ b/examples/meter-detection/source/train.py
@@ -12,6 +12,7 @@ from typing import Any

 import yaml
 from sanitize_onnx import sanitize_onnx
+from training_metrics import write_training_metrics
 from ultralytics import YOLO  # type: ignore[reportMissingImports]


@@ -101,6 +102,7 @@ def main() -> None:
    if not trained_weights.exists():
        raise FileNotFoundError(f"Could not find trained weights in {save_dir / 'weights'}")

+    write_training_metrics(save_dir / "results.csv", model_dir / "training_metrics.json")
    copy_if_exists(trained_weights, model_dir / "best.pt")
    trained_model = YOLO(str(trained_weights))
    onnx_path = Path(trained_model.export(format="onnx", imgsz=args.imgsz))
--- a/examples/meter-detection/source/training_metrics.py
+++ b/examples/meter-detection/source/training_metrics.py
@@ -0,0 +1,82 @@
+import csv
+import json
+import math
+import re
+from pathlib import Path
+from typing import Any
+
+# Maps column headers found in the metrics history CSV to the metric names
+# written to the JSON report. Headers that are not listed here are renamed by
+# ``_normalize_metric_name`` instead.
+METRIC_NAMES = {
+    "metrics/precision(B)": "val.precision",
+    "metrics/recall(B)": "val.recall",
+    "metrics/mAP50(B)": "val.map50",
+    "metrics/mAP50-95(B)": "val.map50_95",
+    "train/box_loss": "train.box_loss",
+    "train/cls_loss": "train.cls_loss",
+    "train/dfl_loss": "train.dfl_loss",
+    "val/box_loss": "val.box_loss",
+    "val/cls_loss": "val.cls_loss",
+    "val/dfl_loss": "val.dfl_loss",
+    "time": "train.elapsed_seconds",
+}
+
+
+def write_training_metrics(results_csv: Path, destination: Path) -> None:
+    steps = _read_metric_steps(results_csv)
+    summary = _build_summary(steps)
+    payload = {
+        "schema_version": 1,
+        "steps": steps,
+        "summary": summary,
+    }
+    destination.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    print(f"Saved {destination}")
+
+
+def _read_metric_steps(results_csv: Path) -> list[dict[str, Any]]:
+    if not results_csv.is_file():
+        raise FileNotFoundError(f"Could not find Ultralytics metrics history: {results_csv}")
+
+    steps: list[dict[str, Any]] = []
+    with results_csv.open(newline="", encoding="utf-8") as csv_file:
+        for row_index, raw_row in enumerate(csv.DictReader(csv_file)):
+            row = {str(key).strip(): value for key, value in raw_row.items()}
+            raw_epoch = row.pop("epoch", row_index)
+            step = int(float(raw_epoch))
+            metrics: dict[str, float] = {}
+            for source_name, raw_value in row.items():
+                if raw_value is None or not raw_value.strip():
+                    continue
+                try:
+                    value = float(raw_value)
+                except ValueError:
+                    continue
+                if math.isfinite(value):
+                    metrics[METRIC_NAMES.get(source_name, _normalize_metric_name(source_name))] = value
+            steps.append({"step": step, "metrics": metrics})
+    return steps
+
+
+def _build_summary(steps: list[dict[str, Any]]) -> dict[str, float]:
+    if not steps:
+        return {}
+
+    summary: dict[str, float] = {}
+    final_step = steps[-1]
+    summary["summary.final_epoch"] = float(final_step["step"])
+    for name, value in final_step["metrics"].items():
+        summary[f"summary.final.{name}"] = value
+
+    scored_steps = [step for step in steps if "val.map50_95" in step["metrics"]]
+    if scored_steps:
+        best_step = max(scored_steps, key=lambda step: step["metrics"]["val.map50_95"])
+        summary["summary.best_epoch"] = float(best_step["step"])
+        summary["summary.best_val.map50_95"] = best_step["metrics"]["val.map50_95"]
+        if "val.map50" in best_step["metrics"]:
+            summary["summary.best_val.map50"] = best_step["metrics"]["val.map50"]
+    return summary
+
+
+def _normalize_metric_name(name: str) -> str:
+    normalized = name.replace("/", ".")
+    normalized = re.sub(r"[^A-Za-z0-9_.-]+", "_", normalized)
+    return normalized.strip("._") or "unnamed"