TDA4VM: Unable to run real time inference of yolox_s_lite on TDA4VM

Part Number: TDA4VM

Tool/software:

I have trained the yolox_s_lite model on a custom dataset using EDGEAI-MODELMAKER.

Due to a local system shutdown, training stopped at the 158th epoch, so I converted the epoch-158 .pth checkpoint to .onnx format using EDGEAI-MMDETECTION (inside the EDGEAI-TENSORLAB repo).
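
For context, the export step boils down to a standard torch.onnx.export call. A minimal sketch (the stand-in model, file name, and input size here are placeholders; the real export is done by edgeai-mmdetection's tooling from the epoch-158 checkpoint):

"""

import torch
import torch.nn as nn

# Stand-in for the trained detector; in practice edgeai-mmdetection rebuilds
# the model and loads the epoch-158 .pth weights before exporting.
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1)).eval()
dummy = torch.randn(1, 3, 640, 640)  # YOLOX-S lite input size

torch.onnx.export(model, dummy, "yolox_s_lite_158.onnx",
                  opset_version=12, input_names=["input"])

"""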

Then I compiled this .onnx model with the onnxrt_ep.py file present inside the EDGEAI-TIDL-TOOLS repo. During compilation I got an error saying the IR and opset versions were mismatched (the export was IR 10 / opset 17, while the tools expect IR 9 / opset 12), so I downgraded the model using the script below:

"""

m = onnx.load(src)

print("Before -> IR:", m.ir_version, "opsets:",
[(imp.domain or "ai.onnx", imp.version) for imp in m.opset_import])

# Force IR version to 9 (do NOT touch opset)
m.ir_version = 9

onnx.save(m, dst)

# Optional: checker may complain about strict IR/opset pairing; you can skip it
try:
from onnx import checker
checker.check_model(dst)
print("ONNX checker passed.")
except Exception as e:
print("Checker warning:", e)

print("Saved:", dst)

 """

After that, compilation using the TIDL tools was successful.

However, when I run inference with the converted .onnx model on the PC, it predicts bounding boxes correctly. When I deploy the model, its prototxt, and the compiled artifacts to the TDA4VM board, no bounding boxes are detected: the camera video runs, but without any detections.

What could be the reasons for this?

Could you please suggest a solution?
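
In case it helps with diagnosis: one isolation test is to run the identical preprocessed frame through a plain CPUExecutionProvider session and through the TIDL session and compare the raw outputs. If the CPU run produces detections and the TIDL run does not, the problem is likely in the compiled artifacts or calibration rather than in the post-processing. A sketch (paths as used on the board; frame_blob.npy is one frame saved from our preprocess() below):

"""

import numpy as np
import onnxruntime as ort

model_path = "/opt/model_zoo/158_onnxrt_Dataset_Buit_Over_COCO_edgeai-mmdet_yolox_s_lite__model_onnx/model/yolox_s_lite_158_ir9_opset12.onnx"
artifacts = "/opt/model_zoo/158_onnxrt_Dataset_Buit_Over_COCO_edgeai-mmdet_yolox_s_lite__model_onnx/artifacts"

blob = np.load("frame_blob.npy")  # one preprocessed frame, saved beforehand

cpu = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
tidl = ort.InferenceSession(
    model_path,
    providers=["TIDLExecutionProvider", "CPUExecutionProvider"],
    provider_options=[{"artifacts_folder": artifacts, "platform": "J7"}, {}],
)

name = cpu.get_inputs()[0].name
out_cpu = cpu.run(None, {name: blob})
out_tidl = tidl.run(None, {name: blob})
for a, b in zip(out_cpu, out_tidl):
    a, b = np.asarray(a, np.float32), np.asarray(b, np.float32)
    print(a.shape, b.shape,
          "max|diff|:", np.abs(a - b).max() if a.shape == b.shape else "n/a")

"""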

For reference, below is the script we are using for inference on the TDA4VM board:

import os
import time

import cv2
import numpy as np
import onnxruntime as ort

import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst

# --- Set TIDL environment variables ---
os.environ["TIDL_RT_PERFSTATS"] = "1"
os.environ["TIDL_RT_LOG_LEVEL"] = "2"

# Initialize GStreamer
Gst.init(None)

# --- GStreamer OUTPUT PIPELINE ---
gst_output_pipeline = (
    "appsrc name=src is-live=true block=true format=GST_FORMAT_TIME "
    "caps=video/x-raw,format=BGR,width=1280,height=720,framerate=30/1 ! "
    "videoconvert ! queue ! waylandsink sync=false"
)
pipeline = Gst.parse_launch(gst_output_pipeline)
appsrc = pipeline.get_by_name("src")
pipeline.set_state(Gst.State.PLAYING)

# --- GStreamer INPUT PIPELINE ---
camera_pipeline = (
    "v4l2src device=/dev/video2 ! "
    "image/jpeg,width=1280,height=720,framerate=30/1 ! jpegdec ! "
    "videoconvert ! video/x-raw,format=BGR ! appsink"
)
cap = cv2.VideoCapture(camera_pipeline, cv2.CAP_GSTREAMER)
if not cap.isOpened():
    print("ERROR: Could not open /dev/video2")
    exit(1)

 

# --- Load TIDL-compiled model ---
model_path = "/opt/model_zoo/158_onnxrt_Dataset_Buit_Over_COCO_edgeai-mmdet_yolox_s_lite__model_onnx/model/yolox_s_lite_158_ir9_opset12.onnx"
session = ort.InferenceSession(
    model_path,
    providers=["TIDLExecutionProvider", "CPUExecutionProvider"],
    provider_options=[
        {"artifacts_folder": "/opt/model_zoo/158_onnxrt_Dataset_Buit_Over_COCO_edgeai-mmdet_yolox_s_lite__model_onnx/artifacts",
         "platform": "J7"},
        {}
    ]
)

input_name = session.get_inputs()[0].name
output_names = [o.name for o in session.get_outputs()]

# --- Custom 7 classes ---
CLASSES = ["person", "bicycle", "car", "motorcycle", "bus", "truck", "rickshaw"]

 

# ---------- PREPROCESS ----------
def preprocess(image, W=640, H=640, layout="NCHW"):
    resized = cv2.resize(image, (W, H), interpolation=cv2.INTER_LINEAR)
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    if layout == "NCHW":
        blob = rgb.transpose(2, 0, 1)[None, ...]  # HWC -> NCHW
    else:
        blob = rgb[None, ...]
    # NOTE: no /255.0 normalization; values stay in the 0-255 range
    return np.ascontiguousarray(blob, dtype=np.float32)
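
# Debug aid: print the input/output layout and dtype the compiled model
# actually expects, to validate the NCHW and no-normalization assumptions.
for _inp in session.get_inputs():
    print("model input:", _inp.name, _inp.shape, _inp.type)
for _out in session.get_outputs():
    print("model output:", _out.name, _out.shape, _out.type)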

 

# ---------- NMS (NumPy implementation) ----------
def nms_numpy(boxes, scores, conf_threshold=0.3, nms_threshold=0.4):
    boxes = np.array(boxes)
    scores = np.array(scores)

    # Filter by confidence
    keep = scores >= conf_threshold
    boxes, scores = boxes[keep], scores[keep]
    indices = np.where(keep)[0]

    if len(boxes) == 0:
        return []

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 0] + boxes[:, 2]
    y2 = boxes[:, 1] + boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)

    order = scores.argsort()[::-1]
    keep_indices = []

    while order.size > 0:
        i = order[0]
        keep_indices.append(indices[i])

        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h

        iou = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(iou <= nms_threshold)[0]
        order = order[inds + 1]

    return keep_indices
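
# Note: this hand-rolled loop could be swapped for OpenCV's built-in, which
# uses the same (x, y, w, h) box convention:
#   idxs = cv2.dnn.NMSBoxes(boxes, scores, conf_threshold, nms_threshold)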

 

# ---------- GENERIC DECODER ----------
def try_parse_outputs(outs):
    def looks_like_boxes(a):
        return a.ndim >= 2 and a.shape[-1] in (5, 6, 7) and np.issubdtype(a.dtype, np.floating)

    def looks_like_labels(a):
        return np.issubdtype(a.dtype, np.integer) and (a.ndim in (1, 2, 3))

    def squeeze_to_2d(a):
        a = np.array(a)
        while a.ndim > 2:
            a = a.reshape(-1, a.shape[-1])
        return a

    if len(outs) == 2:
        a, b = outs
        if looks_like_boxes(a) and looks_like_labels(b):
            dets, labs = squeeze_to_2d(a), squeeze_to_2d(b).reshape(-1)
        elif looks_like_boxes(b) and looks_like_labels(a):
            dets, labs = squeeze_to_2d(b), squeeze_to_2d(a).reshape(-1)
        else:
            raise RuntimeError("Cannot classify outputs.")

        if dets.shape[1] >= 5:
            boxes, scores = dets[:, :4], dets[:, 4]
            if dets.shape[1] >= 6:
                classes = dets[:, 5].astype(np.int32)
            else:
                classes = labs.astype(np.int32)
            return boxes, scores, classes

    elif len(outs) == 1:
        x = np.array(outs[0])
        while x.ndim > 2:
            x = x.reshape(-1, x.shape[-1])
        if x.shape[1] in (6, 7):
            boxes, scores, classes = x[:, :4], x[:, 4], x[:, 5].astype(np.int32)
            return boxes, scores, classes

    raise RuntimeError("Unexpected model outputs layout.")
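
# Debug suggestion for the board-vs-PC discrepancy: dump the raw output
# shapes/dtypes once on each platform and diff them, e.g.
#   print([(np.asarray(o).shape, np.asarray(o).dtype) for o in outs])
# inside the loop below, to see whether TIDL returns empty tensors or
# differently laid-out ones.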

 

print("White check mark Running object detection with TIDL acceleration... Press Ctrl+C to stop.")

 

try:
    while True:
        start_time = time.time()
        ret, frame = cap.read()
        if not ret:
            print("WARNING: Failed to read frame")
            continue

        # Inference
        outs = session.run(output_names, {input_name: preprocess(frame)})
        boxes_xyxy, scores, class_ids = try_parse_outputs(outs)

        # Filter + NMS
        CONF_THRESHOLD, NMS_THRESHOLD = 0.3, 0.4
        keep = scores >= CONF_THRESHOLD
        boxes_xyxy, scores, class_ids = boxes_xyxy[keep], scores[keep], class_ids[keep]

        boxes_xywh = boxes_xyxy.copy()
        boxes_xywh[:, 2] -= boxes_xyxy[:, 0]
        boxes_xywh[:, 3] -= boxes_xyxy[:, 1]

        boxes_list = boxes_xywh.astype(int).tolist()
        scores_list = scores.astype(float).tolist()
        idxs = nms_numpy(boxes_list, scores_list, CONF_THRESHOLD, NMS_THRESHOLD)

        # Draw detections
        sx, sy = frame.shape[1] / 640.0, frame.shape[0] / 640.0
        for i in idxs:
            x, y, w, h = boxes_list[i]
            x1, y1 = int(round(x * sx)), int(round(y * sy))
            x2, y2 = int(round((x + w) * sx)), int(round((y + h) * sy))
            cls = int(class_ids[i])
            name = CLASSES[cls] if 0 <= cls < len(CLASSES) else str(cls)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"{name} {scores_list[i]:.2f}", (x1, max(0, y1 - 5)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
            if cls == 0:
                print("label:", name, f"{scores_list[i]:.2f}", "x1:", x1, "y1:", y1, "x2:", x2, "y2:", y2)

        # Send to GStreamer sink
        data = frame.tobytes()
        buf = Gst.Buffer.new_allocate(None, len(data), None)
        buf.fill(0, data)
        buf.duration = Gst.util_uint64_scale_int(1, Gst.SECOND, 30)
        timestamp = int(time.time() * Gst.SECOND)
        buf.pts = buf.dts = timestamp
        appsrc.emit("push-buffer", buf)

        # Frame rate limiting
        elapsed = time.time() - start_time
        delay = max(0, (1 / 30) - elapsed)
        time.sleep(delay)

except KeyboardInterrupt:
    print("Interrupted. Cleaning up...")

finally:
    cap.release()
    pipeline.set_state(Gst.State.NULL)