Using the Cortex M4 Core

Using the Cortex M4 Core on Kontron i.MX8MM Boards

The following guide uses U-Boot or Linux to load a binary compiled with the NXP MCUXpresso SDK into the M4 core of the i.MX8MM and start it. The application on the M4 is able to communicate with the Linux system through a custom rpmsg client driver kernel module.

1. Compiling an Application for the M4 Core

Please follow the guide in this blog post from i.MX guru Detlev Zundel to set up the SDK and compile a demo application.

The demo applications for the NXP i.MX8MM Evaluation Kit (EVK) can also be used on the Kontron boards. You might want to adjust the peripherals and pinmux settings to match your hardware.

In the following we will use two different example apps:

  1. Example 1: The "hello_world" example for creating a simple UART console. We will load this app via U-Boot.
  2. Example 2: The "rpmsg_lite_pingpong_rtos" example to demonstrate the intercore communication. We will load this app via Linux.

In order to receive log messages from the M4 core, we need access to an additional UART (not the one used by Linux). In our example we will use the RS232 interface (UART1) on the Kontron board as the M4 debug console.

To switch from UART4 used on the NXP i.MX8MM EVK to UART1, the following changes in the source code are required.

Demo Source Code Changes for using UART1 as M4 console
--- a/evkmimx8mm/demo_apps/hello_world/pin_mux.c
+++ b/evkmimx8mm/demo_apps/hello_world/pin_mux.c
@@ -55,14 +55,14 @@ BOARD_InitPins:
  *
  * END ****************************************************************************************************************/
 void BOARD_InitPins(void) {                                /*!< Function assigned for the core: Cortex-M4[m4] */
-    IOMUXC_SetPinMux(IOMUXC_UART4_RXD_UART4_RX, 0U);
-    IOMUXC_SetPinConfig(IOMUXC_UART4_RXD_UART4_RX, 
+    IOMUXC_SetPinMux(IOMUXC_SAI2_RXC_UART1_RX, 0U);
+    IOMUXC_SetPinConfig(IOMUXC_SAI2_RXC_UART1_RX, 
                         IOMUXC_SW_PAD_CTL_PAD_DSE(6U) |
                         IOMUXC_SW_PAD_CTL_PAD_FSEL(2U));
-    IOMUXC_SetPinMux(IOMUXC_UART4_TXD_UART4_TX, 0U);
-    IOMUXC_SetPinConfig(IOMUXC_UART4_TXD_UART4_TX, 
+    IOMUXC_SetPinMux(IOMUXC_SAI2_RXFS_UART1_TX, 0U);
+    IOMUXC_SetPinConfig(IOMUXC_SAI2_RXFS_UART1_TX, 
                         IOMUXC_SW_PAD_CTL_PAD_DSE(6U) |
                         IOMUXC_SW_PAD_CTL_PAD_FSEL(2U));

--- a/evkmimx8mm/multicore_examples/rpmsg_lite_pingpong_rtos/linux_remote/pin_mux.c
+++ b/evkmimx8mm/multicore_examples/rpmsg_lite_pingpong_rtos/linux_remote/pin_mux.c
@@ -55,14 +55,14 @@ BOARD_InitPins:
  *
  * END ****************************************************************************************************************/
 void BOARD_InitPins(void) {                                /*!< Function assigned for the core: Cortex-M4[m4] */
-    IOMUXC_SetPinMux(IOMUXC_UART4_RXD_UART4_RX, 0U);
-    IOMUXC_SetPinConfig(IOMUXC_UART4_RXD_UART4_RX, 
+    IOMUXC_SetPinMux(IOMUXC_SAI2_RXC_UART1_RX, 0U);
+    IOMUXC_SetPinConfig(IOMUXC_SAI2_RXC_UART1_RX, 
                         IOMUXC_SW_PAD_CTL_PAD_DSE(6U) |
                         IOMUXC_SW_PAD_CTL_PAD_FSEL(2U));
-    IOMUXC_SetPinMux(IOMUXC_UART4_TXD_UART4_TX, 0U);
-    IOMUXC_SetPinConfig(IOMUXC_UART4_TXD_UART4_TX, 
+    IOMUXC_SetPinMux(IOMUXC_SAI2_RXFS_UART1_TX, 0U);
+    IOMUXC_SetPinConfig(IOMUXC_SAI2_RXFS_UART1_TX, 
                         IOMUXC_SW_PAD_CTL_PAD_DSE(6U) |
                         IOMUXC_SW_PAD_CTL_PAD_FSEL(2U));
 }

--- a/boards/evkmimx8mm/board.c
+++ b/boards/evkmimx8mm/board.c
@@ -24,7 +24,7 @@
 void BOARD_InitDebugConsole(void)
 {
     uint32_t uartClkSrcFreq = BOARD_DEBUG_UART_CLK_FREQ;
-    CLOCK_EnableClock(kCLOCK_Uart4);
+    CLOCK_EnableClock(kCLOCK_Uart1);
     DbgConsole_Init(BOARD_DEBUG_UART_INSTANCE, BOARD_DEBUG_UART_BAUDRATE, BOARD_DEBUG_UART_TYPE, uartClkSrcFreq);
 }
 /* Initialize MPU, configure non-cacheable memory */

--- a/boards/evkmimx8mm/board.h
+++ b/boards/evkmimx8mm/board.h
@@ -19,13 +19,13 @@
 /* The UART to use for debug messages. */
 #define BOARD_DEBUG_UART_TYPE     kSerialPort_Uart
 #define BOARD_DEBUG_UART_BAUDRATE 115200u
-#define BOARD_DEBUG_UART_BASEADDR UART4_BASE
-#define BOARD_DEBUG_UART_INSTANCE 4U
+#define BOARD_DEBUG_UART_BASEADDR UART1_BASE
+#define BOARD_DEBUG_UART_INSTANCE 1U
 #define BOARD_DEBUG_UART_CLK_FREQ                                                           \
-    CLOCK_GetPllFreq(kCLOCK_SystemPll1Ctrl) / (CLOCK_GetRootPreDivider(kCLOCK_RootUart4)) / \
-        (CLOCK_GetRootPostDivider(kCLOCK_RootUart4)) / 10
-#define BOARD_UART_IRQ         UART4_IRQn
-#define BOARD_UART_IRQ_HANDLER UART4_IRQHandler
+    CLOCK_GetPllFreq(kCLOCK_SystemPll1Ctrl) / (CLOCK_GetRootPreDivider(kCLOCK_RootUart1)) / \
+        (CLOCK_GetRootPostDivider(kCLOCK_RootUart1)) / 10
+#define BOARD_UART_IRQ         UART1_IRQn
+#define BOARD_UART_IRQ_HANDLER UART1_IRQHandler

 #define GPV5_BASE_ADDR        (0x32500000)
 #define FORCE_INCR_OFFSET     (0x4044)

--- a/boards/evkmimx8mm/clock_config.c
+++ b/boards/evkmimx8mm/clock_config.c
@@ -99,8 +99,8 @@ void BOARD_BootClockRUN(void)
     //    CLOCK_SetRootDivider(kCLOCK_RootAxi, 1U, 2);
     //    CLOCK_SetRootMux(kCLOCK_RootAxi, kCLOCK_AxiRootmuxSysPll1); /* switch AXI to SYSTEM PLL1 800MHZ */

-    CLOCK_SetRootMux(kCLOCK_RootUart4, kCLOCK_UartRootmuxSysPll1Div10); /* Set UART source to SysPLL1 Div10 80MHZ */
-    CLOCK_SetRootDivider(kCLOCK_RootUart4, 1U, 1U);                     /* Set root clock to 80MHZ/ 1= 80MHZ */
+    CLOCK_SetRootMux(kCLOCK_RootUart1, kCLOCK_UartRootmuxSysPll1Div10); /* Set UART source to SysPLL1 Div10 80MHZ */
+    CLOCK_SetRootDivider(kCLOCK_RootUart1, 1U, 1U);                     /* Set root clock to 80MHZ/ 1= 80MHZ */

     CLOCK_EnableClock(kCLOCK_Rdc); /* Enable RDC clock */
     /* The purpose to enable the following modules clock is to make sure the M4 core could work normally when A53 core

2. Modify the TF-A code to assign peripherals to the correct RDC domain

The Resource Domain Controller (RDC) is used to assign peripherals to either the A53 domain or the M4 domain. Using a peripheral from a domain that it is not assigned to usually leads to failures like system lockups.

The following code changes in imx-atf assign the UART1 used as debug console for the M4 core to the M4 domain. Other peripherals need to be added as required for the application.

TF-A code changes to assign UART1 to M4 domain
--- a/plat/imx/imx8m/imx8mm/imx8mm_bl31_setup.c
+++ b/plat/imx/imx8m/imx8mm/imx8mm_bl31_setup.c
@@ -58,7 +58,7 @@ static const struct imx_rdc_cfg rdc[] = {
        RDC_MDAn(RDC_MDA_M4, DID1),

        /* peripherals domain permission */
-       RDC_PDAPn(RDC_PDAP_UART4, D1R | D1W),
+       RDC_PDAPn(RDC_PDAP_UART1, D1R | D1W),
        RDC_PDAPn(RDC_PDAP_UART2, D0R | D0W),

        /* memory region */

3. Modify the Linux Devicetree

3.1 Adding the Devicetree Nodes

Step only needed for Example 2

This step is only needed for "Example 2". It can be skipped if only "Example 1" is used (loaded from U-Boot).

DDR addresses

The example from NXP is configured for the NXP EVK with 2GB of DDR RAM. If your hardware has less RAM available, you might have to adjust the memory mapping of the M4 app and change the devicetree accordingly. For 1GB of DDR using 0x77000000 as base for the M4 app and 0x78000000 for the shared resources should work. In the M4 app, MEMORY in MIMX8MM6xxxxx_cm4_ddr_ram.ld and VDEV0_VRING_BASE in board.h need to be changed.

Add the following nodes to your board devicetree's root node in order to set up the memory for the M4 core and the remoteproc driver.

Devicetree Nodes for the M4 Core
/ {
    [...]

    reserved-memory {
        #address-cells = <2>;
        #size-cells = <2>;
        ranges;

        m4_reserved: m4@0x80000000 {
            reg = <0 0x80000000 0 0x1000000>;
            no-map;
        };

        vdev0vring0: vdev0vring0@b8000000 {
            reg = <0 0xb8000000 0 0x8000>;
            no-map;
        };

        vdev0vring1: vdev0vring1@b8008000 {
            reg = <0 0xb8008000 0 0x8000>;
            no-map;
        };

        rsc_table: rsc_table@b80ff000 {
            reg = <0 0xb80ff000 0 0x1000>;
            no-map;
        };

        vdevbuffer: vdevbuffer@b8400000 {
            compatible = "shared-dma-pool";
            reg = <0 0xb8400000 0 0x100000>;
            no-map;
        };
    };

    imx8mm-cm4 {
        compatible = "fsl,imx8mm-cm4";
        clocks = <&clk IMX8MM_CLK_M4_DIV>;
        mbox-names = "tx", "rx", "rxdb";
        mboxes = <&mu 0 1
              &mu 1 1
              &mu 3 1>;
        memory-region = <&vdevbuffer>, <&vdev0vring0>, <&vdev0vring1>, <&rsc_table>;
        syscon = <&src>;
    };

    [...]
};

3.2 Disable M4 Peripherals in Linux

At this point we also need to make sure that no peripherals assigned to the M4 domain are probed by Linux. In our case we need to disable UART1:

--- a/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mm-kontron-n801x-s.dts
@@ -307,7 +307,7 @@ &uart1 {
        pinctrl-names = "default";
        pinctrl-0 = <&pinctrl_uart1>;
        uart-has-rtscts;
-       status = "okay";
+       status = "disabled";
 };

4. Modify the Kernel Configuration

Step only needed for Example 2

This step is only needed for "Example 2". It can be skipped if only "Example 1" is used (loaded from U-Boot).

In order to build the drivers used in this example, we enable the following in our defconfig.

+CONFIG_REMOTEPROC=y
+CONFIG_REMOTEPROC_CDEV=y
+CONFIG_IMX_REMOTEPROC=y
+CONFIG_RPMSG_CHAR=m
+CONFIG_RPMSG_CTRL=m
+CONFIG_RPMSG_VIRTIO=m
+CONFIG_IMX_RPMSG_PINGPONG=m

5. Example 1: Loading and Starting through U-Boot

The compiled application's BIN file is copied to the DDR via TFTP (or alternatively from some storage device).

=> tftp 0x42000000 hello_world.bin
Using ethernet@30be0000 device
TFTP from server 192.168.1.10; our IP address is 192.168.1.11
Filename 'hello_world.bin'.
Load address: 0x42000000
Loading: #
         2.7 MiB/s
done
Bytes transferred = 14260 (37b4 hex)

Next, we copy the executable from DDR to the internal TCML memory, where we want to start it from.

=> cp.b 0x42000000 0x7e0000 ${filesize}

At last we will use the bootaux command to start the app.

=> bootaux 0x7e0000
## No elf image at address 0x007e0000
## Starting auxiliary core stack = 0x20020000, pc = 0x1FFE02CD...

At this point you should closely watch the UART console attached to the M4. It will print "hello world" and start echoing all characters it receives.

Linux Kernel Clock Gating

To make sure that the app running on the M4 core will continue to be executed and not freeze when Linux is booted, the kernel needs to be told not to gate the system clocks. This can be done by adding the parameter clk-imx8mm.mcore_booted=1 to the kernel command line. Usually this can be done by appending the value to the bootargs_base variable in the U-Boot environment.

6. Example 2: Loading and Starting through Linux

Required Kernel Version

While "Example 1" works fine with the v5.10-ktn kernel branch, this example requires additional patches for the remoteproc and rpmsg frameworks from later kernel versions. To make it work you either need a recent mainline kernel (tested on 5.19-rc5) or you need to integrate the backport for v5.10 provided here.

After booting the system with the modifications applied as described before, we need to copy the compiled M4 application's ELF file to the root filesystem. In our example we copy the rpmsg_lite_pingpong_rtos_linux_remote.elf to /lib/firmware.

Next we can load the executable into the M4 core using the remoteproc sysfs interface:

root@kontron-mx8mm:~# echo -n rpmsg_lite_pingpong_rtos_linux_remote.elf > /sys/class/remoteproc/remoteproc0/firmware

Once loaded we can start the execution of the M4 application:

root@kontron-mx8mm:~# echo start > /sys/class/remoteproc/remoteproc0/state
[ 7348.685563] remoteproc remoteproc0: powering up imx-rproc
[ 7348.692688] remoteproc remoteproc0: Booting fw image rpmsg_lite_pingpong_rtos_linux_remote.elf, size 409576
[ 7348.702909]  remoteproc0#vdev0buffer: assigned reserved memory node vdevbuffer@b8400000
[ 7348.712718] virtio_rpmsg_bus virtio0: rpmsg host is online
[ 7348.718388]  remoteproc0#vdev0buffer: registered virtio0 (type 7)
[ 7348.724550] remoteproc remoteproc0: remote processor imx-rproc is now up
[ 7349.714126] virtio_rpmsg_bus virtio0: creating channel rpmsg-openamp-demo-channel addr 0x1e

From the kernel log we can see that the M4 core was brought up and the rpmsg application on the M4 already bound itself to the kernel and announced the available communication channels.

At this point you should also see some messages printed to the M4 console:

RPMSG Ping-Pong FreeRTOS RTOS API Demo...
RPMSG Share Base Addr is 0xb8000000
Link is up!
Nameservice announce sent.

7. Loading the rpmsg Client Driver Kernel Module

Now the last part of the demo is to load the rpmsg client driver that uses the message bus to communicate with the already running M4 application.

NXP provides a demo kernel driver imx_rpmsg_pingpong that communicates with the rpmsg_lite_pingpong_rtos app on the M4.

root@kontron-mx8mm:~# modprobe imx_rpmsg_pingpong

As soon as we load the driver module we will see the output of the messages being sent and received in the kernel log and on the M4 console.

Pingpong Demo Messages
root@kontron-mx8mm:~# modprobe imx_rpmsg_pingpong
[ 8073.867508] 90:init
[ 8073.869731] 42:rpmsg_pingpong_probe
[ 8073.873232] imx_rpmsg_pingpong virtio0.rpmsg-openamp-demo-channel.-1.30: new channel: 0x400 -> 0x1e!
[ 8073.885947] get 1 (src: 0x1e)
[ 8073.892351] get 3 (src: 0x1e)
[ 8073.897069] get 5 (src: 0x1e)
[ 8073.901646] get 7 (src: 0x1e)
[ 8073.906147] get 9 (src: 0x1e)
[ 8073.910645] get 11 (src: 0x1e)
[ 8073.915247] get 13 (src: 0x1e)
[ 8073.919844] get 15 (src: 0x1e)
[ 8073.924436] get 17 (src: 0x1e)
[ 8073.929131] get 19 (src: 0x1e)
[ 8073.933734] get 21 (src: 0x1e)
[ 8073.938321] get 23 (src: 0x1e)
[ 8073.942935] get 25 (src: 0x1e)
[ 8073.947551] get 27 (src: 0x1e)
[ 8073.952152] get 29 (src: 0x1e)
[ 8073.956758] get 31 (src: 0x1e)
[ 8073.961358] get 33 (src: 0x1e)
[ 8073.965946] get 35 (src: 0x1e)
[ 8073.970533] get 37 (src: 0x1e)
[ 8073.975128] get 39 (src: 0x1e)
[ 8073.979737] get 41 (src: 0x1e)
[ 8073.984334] get 43 (src: 0x1e)
[ 8073.988941] get 45 (src: 0x1e)
[ 8073.993539] get 47 (src: 0x1e)
[ 8073.998127] get 49 (src: 0x1e)
[ 8074.002713] get 51 (src: 0x1e)
[ 8074.007308] get 53 (src: 0x1e)
[ 8074.011917] get 55 (src: 0x1e)
[ 8074.016512] get 57 (src: 0x1e)
[ 8074.021120] get 59 (src: 0x1e)
[ 8074.025718] get 61 (src: 0x1e)
[ 8074.030304] get 63 (src: 0x1e)
[ 8074.034902] get 65 (src: 0x1e)
[ 8074.039505] get 67 (src: 0x1e)
[ 8074.044105] get 69 (src: 0x1e)
[ 8074.048713] get 71 (src: 0x1e)
[ 8074.053332] get 73 (src: 0x1e)
[ 8074.057918] get 75 (src: 0x1e)
[ 8074.062506] get 77 (src: 0x1e)
[ 8074.067102] get 79 (src: 0x1e)
[ 8074.071710] get 81 (src: 0x1e)
[ 8074.076308] get 83 (src: 0x1e)
[ 8074.080916] get 85 (src: 0x1e)
[ 8074.085514] get 87 (src: 0x1e)
[ 8074.090103] get 89 (src: 0x1e)
[ 8074.094690] get 91 (src: 0x1e)
[ 8074.099285] get 93 (src: 0x1e)
[ 8074.103889] get 95 (src: 0x1e)
[ 8074.108488] get 97 (src: 0x1e)
[ 8074.113095] get 99 (src: 0x1e)
[ 8074.117698] get 101 (src: 0x1e)
[ 8074.120850] imx_rpmsg_pingpong virtio0.rpmsg-openamp-demo-channel.-1.30: goodbye!
[ 8074.229325] imx-rproc imx8mm-cm4: imx_rproc_kick: failed (0, err:-62)
RPMSG Ping-Pong FreeRTOS RTOS API Demo...
RPMSG Share Base Addr is 0xb8000000
Link is up!
Nameservice announce sent.
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Waiting for ping...
Sending pong...
Ping pong done, deinitializing...
Looping forever...

References