ARTICLE AD BOX
I am developing an object detection project running on a Raspberry Pi 5.
On the camera side (Picamera2 / libcamera), everything works perfectly fine: when the camera runs alone, RAM usage is completely stable.
However, as soon as I move to the object detection stage using YOLO, a serious problem appears. Even after completely removing Ultralytics YOLO and performing inference directly with OpenVINO Runtime, the issue persists.
The critical observation is this:
- Even when no image preprocessing is performed,
- even when no camera frames are used,
- even when I repeatedly call only infer() on a static input,
RAM usage still increases by tens of megabytes per second.
This clearly indicates that the problem is not caused by:
color conversion
image resizing
camera buffers
NumPy allocations
OpenCV preprocessing
Instead, the issue appears to originate from OpenVINO on ARM (Raspberry Pi) not releasing memory properly.
This behavior is especially reproducible with:
OpenVINO 2025.4.1
Python 3.13
Raspberry Pi 5 (ARM64)
On a desktop PC, the same logic appears to work “fine”, but the issue is likely masked by the large amount of available RAM.
To isolate the issue, I wrote a minimal test where:
the model is loaded once
tensors are reused
no new NumPy arrays are allocated
inference is executed in a tight loop
Even in this isolation mode, RSS memory continuously increases.
At this point, I am considering:
downgrading OpenVINO
downgrading Python
or abandoning OpenVINO entirely and switching to TFLite or NCNN
Before doing that, I would like to understand:
Is this a known OpenVINO memory leak on ARM?
Is this related to Python 3.13 bindings?
Is there any recommended workaround or configuration to force memory reuse?
Below is a minimal reproducible example that demonstrates the issue.
Example code that reproduces the RAM leak:
"""Minimal reproduction script for RSS growth during OpenVINO CPU inference.

Two modes are available via ``--mode``:

* ``isolation`` - calls ``infer()`` in a tight loop with no preprocessing.
* ``fixed``     - runs the buffer-reusing preprocessing step followed by
  ``infer()`` on a static random frame.

Both modes print the process RSS every 30 frames and run until Ctrl+C.
"""
import sys
import os
import glob
import time
import argparse
import gc

import psutil
import cv2
import numpy as np
# The `openvino.runtime` namespace is deprecated in recent OpenVINO releases;
# the same API (Core, Tensor, ...) is exposed from the top-level package.
import openvino as ov

# --- CONFIGURATION ---
MODEL_DIR = "yolo11n_openvino_model"
CONF_THRESHOLD = 0.50
INPUT_W, INPUT_H = 640, 640  # Model input dimensions
CAM_W, CAM_H = 640, 480      # Camera dimensions


def get_rss_mb():
    """Return the resident set size of the current process in MiB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024


class YoloZeroAlloc:
    """OpenVINO inference wrapper that reuses pre-allocated buffers.

    The model is read and compiled once for the CPU device with a static
    ``[1, 3, INPUT_H, INPUT_W]`` input shape. All intermediate image buffers
    used by :meth:`preprocess_zero_alloc` are allocated in ``__init__``.
    """

    def __init__(self, model_dir):
        """Load the first ``*.xml`` model found in *model_dir* and set up buffers.

        Args:
            model_dir: Directory containing the OpenVINO IR ``.xml`` file.

        Raises:
            FileNotFoundError: If *model_dir* contains no ``.xml`` file.
        """
        self.core = ov.Core()

        # Sort so the chosen model is deterministic when several .xml files
        # exist; glob.glob() returns matches in arbitrary order.
        xml_files = sorted(glob.glob(os.path.join(model_dir, "*.xml")))
        if not xml_files:
            raise FileNotFoundError(f"No .xml in {model_dir}")

        print(f"Loading: {xml_files[0]}")
        model = self.core.read_model(xml_files[0])

        # Force a static input shape before compiling.
        print(f"Forcing Shape: [1, 3, {INPUT_H}, {INPUT_W}]")
        model.reshape([1, 3, INPUT_H, INPUT_W])

        self.compiled_model = self.core.compile_model(model, "CPU")
        self.infer_request = self.compiled_model.create_infer_request()

        # --- Pre-allocated buffers ---
        # 1. Input tensor of the infer request; `input_data_buffer` is the
        #    NumPy array the preprocessing step writes into.
        #    (presumably float32 NCHW - confirm against the model)
        self.input_tensor = self.infer_request.get_input_tensor()
        self.input_data_buffer = self.input_tensor.data

        # 2. Resize buffer (uint8, HWC). The letterbox scale is computed once
        #    from the configured camera and model dimensions.
        scale = min(INPUT_W / CAM_W, INPUT_H / CAM_H)
        self.new_w = int(CAM_W * scale)
        self.new_h = int(CAM_H * scale)
        self.resize_buffer = np.zeros((self.new_h, self.new_w, 3), dtype=np.uint8)

        # 3. Canvas buffer (uint8, HWC) covering the full model input,
        #    pre-filled with the gray padding value 114.
        self.canvas_buffer = np.full((INPUT_H, INPUT_W, 3), 114, dtype=np.uint8)

        # Padding offsets that center the resized image on the canvas.
        self.dw = (INPUT_W - self.new_w) // 2
        self.dh = (INPUT_H - self.new_h) // 2

        print("Buffers Allocated. Memory Pools Ready.")

    def preprocess_zero_alloc(self, img_rgb):
        """Letterbox *img_rgb* into the model input tensor using fixed buffers.

        The image is resized into ``resize_buffer``, pasted onto the gray
        ``canvas_buffer``, then scaled to 0-1 and written in CHW order
        directly into ``input_data_buffer``.

        Args:
            img_rgb: 3-channel ``uint8`` image array in HWC layout.

        Raises:
            ValueError: If *img_rgb* is not a 3-channel ``uint8`` HWC array.
        """
        # cv2.resize only writes into `dst` when dst already has the result's
        # shape and dtype; otherwise it allocates a new array and leaves
        # `resize_buffer` untouched, so stale data would be fed to the model.
        if img_rgb.ndim != 3 or img_rgb.shape[2] != 3 or img_rgb.dtype != np.uint8:
            raise ValueError(
                "img_rgb must be a uint8 HWC array with 3 channels, got "
                f"shape={img_rgb.shape} dtype={img_rgb.dtype}"
            )

        # 1. Resize directly into the pre-allocated buffer.
        cv2.resize(img_rgb, (self.new_w, self.new_h), dst=self.resize_buffer)

        # 2. Reset the canvas to the gray padding value in place.
        self.canvas_buffer[:] = 114

        # 3. Copy the resized image into the centered region of the canvas.
        self.canvas_buffer[
            self.dh:self.dh + self.new_h,
            self.dw:self.dw + self.new_w,
        ] = self.resize_buffer

        # 4. HWC -> CHW through a transpose view (no data copy), then
        #    normalize 0-255 -> 0-1 with the result written straight into the
        #    input tensor's array via `out=`.
        canvas_chw = self.canvas_buffer.transpose((2, 0, 1))
        np.divide(canvas_chw, 255.0, out=self.input_data_buffer[0], casting='unsafe')

    def infer_isolation(self):
        """Run inference only, on whatever the input tensor currently holds."""
        self.infer_request.infer()
        # Read one output element so the result is actually touched.
        # (indexing assumes a 3-D output tensor)
        _ = self.infer_request.get_output_tensor().data[0, 0, 0]

    def infer_pipeline(self, img_rgb):
        """Preprocess *img_rgb*, run inference, and return the output array.

        NOTE(review): the returned array presumably aliases memory owned by
        the infer request and may be overwritten by the next ``infer()`` call;
        copy it if it must outlive the next frame - confirm against the
        OpenVINO ``Tensor.data`` documentation.
        """
        self.preprocess_zero_alloc(img_rgb)
        self.infer_request.infer()
        return self.infer_request.get_output_tensor().data


def _run_benchmark_loop(tag, step, gc_every):
    """Call *step* forever, printing FPS and RSS every 30 frames.

    Args:
        tag: Short prefix for each report line (e.g. ``"ISO"``).
        step: Zero-argument callable executed once per frame.
        gc_every: Run ``gc.collect()`` every this many frames.

    The loop ends when the user presses Ctrl+C.
    """
    frames = 0
    start = time.time()
    # The try wraps the whole loop rather than each iteration: same behavior
    # on Ctrl+C, without re-entering a try block per frame.
    try:
        while True:
            step()
            frames += 1

            if frames % 30 == 0:
                rss = get_rss_mb()
                elapsed = time.time() - start
                fps = frames / elapsed
                print(f"{tag} | T:{elapsed:.0f}s | FPS:{fps:.1f} | RAM:{rss:.1f}MB")

            # Periodic manual collection to rule out Python-level garbage.
            if frames % gc_every == 0:
                gc.collect()
    except KeyboardInterrupt:
        # Ctrl+C is the intended way to stop the test.
        pass


# --- TEST MODES ---
def run_isolation_test():
    """MODE 1: isolation.

    Runs ``infer()`` repeatedly without any preprocessing. RSS growth in this
    mode cannot come from the Python preprocessing code in this file.
    Garbage collection is forced every 100 frames.
    """
    print("\n--- MODE: ISOLATION (No Preprocessing) ---")
    yolo = YoloZeroAlloc(MODEL_DIR)

    print("Starting Inference Loop on Static Data...")
    _run_benchmark_loop("ISO", yolo.infer_isolation, gc_every=100)


def run_fixed_test():
    """MODE 2: buffer-reusing pipeline.

    Runs preprocessing plus inference on one static random frame, so every
    iteration reuses the same input and intermediate buffers. Garbage
    collection is forced every 60 frames.
    """
    print("\n--- MODE: FIXED ZERO-ALLOC PIPELINE ---")
    yolo = YoloZeroAlloc(MODEL_DIR)

    # Static dummy frame filled with random pixel values.
    frame_rgb = np.zeros((CAM_H, CAM_W, 3), dtype=np.uint8)
    cv2.randu(frame_rgb, 0, 255)

    print("Starting Optimized Pipeline...")
    _run_benchmark_loop("FIX", lambda: yolo.infer_pipeline(frame_rgb), gc_every=60)


def main():
    """Parse ``--mode`` and run the selected test."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["isolation", "fixed"], required=True)
    args = parser.parse_args()

    if args.mode == "isolation":
        run_isolation_test()
    elif args.mode == "fixed":
        run_fixed_test()


if __name__ == "__main__":
    main()