implement lut lookup using vector extensions

vroland · Nov 24, 2023 · d4ab7cd · d4ab7cd
1 parent bd3194a
commit d4ab7cd
Show file tree

Hide file tree

Showing 4 changed files with 219 additions and 13 deletions.
diff --git a/examples/dragon/main/CMakeLists.txt b/examples/dragon/main/CMakeLists.txt
@@ -1,3 +1,3 @@
-set(app_sources "main.c")
+set(app_sources "test.S" "main.c")
 
 idf_component_register(SRCS ${app_sources} REQUIRES epdiy)
diff --git a/examples/dragon/main/func.h b/examples/dragon/main/func.h
diff --git a/examples/dragon/main/main.c b/examples/dragon/main/main.c
@@ -1,12 +1,15 @@
 /* Simple firmware for a ESP32 displaying a static image on an EPaper Screen */
 
+#include <string.h>
 #include "esp_heap_caps.h"
 #include "freertos/FreeRTOS.h"
 #include "freertos/task.h"
 
 #include "dragon.h"
 #include "epd_highlevel.h"
 #include "epdiy.h"
+#include "output_common/lut.h"
+#include "ED097TC2.h"
 
 EpdiyHighlevelState hl;
 
@@ -17,26 +20,120 @@ EpdiyHighlevelState hl;
 #define DEMO_BOARD epd_board_v7
 #endif
 
-void idf_loop() {
-    EpdRect dragon_area = {.x = 0, .y = 0, .width = dragon_width, .height = dragon_height};
+#include "esp_timer.h"
+#include "esp_random.h"
+
+// Reference line-conversion routine; only declared here, the definition is not in this file.
+void IRAM_ATTR calc_epd_input_1ppB_64k(const uint32_t *ld, uint8_t *epd_input, const uint8_t *conversion_lut, uint32_t epd_width);
+
+// Vector-extension variant of the routine above, implemented in test.S.
+esp_err_t calc_epd_input_1ppB_64k_ve(const uint32_t *ld, uint8_t *epd_input, const uint8_t *conversion_lut, uint32_t epd_width);
+
+// Fills `lut` (of `lut_size` bytes) for the given mode, frame and waveform phases;
+// only declared here, the definition is not in this file.
+enum EpdDrawError calculate_lut(
+    uint8_t* lut,
+    int lut_size,
+    enum EpdDrawMode mode,
+    int frame,
+    const EpdWaveformPhases* phases
+);
+
+/**
+ * Benchmark the per-line EPD input calculation and print the timings.
+ *
+ * Three variants are timed over MEASUREMENTS iterations each:
+ *  - "plain":     calc_epd_input_1ppB_64k() with the 64 KiB LUT,
+ *  - "memcpy":    four memcpy() calls of one output line per iteration, as a reference,
+ *  - "optimized": calc_epd_input_1ppB_64k_ve() with a table of 256 32-bit entries.
+ *
+ * The output of the optimized variant is compared with the output of the
+ * plain variant; on mismatch the first 30 entries are dumped.
+ * Both LUT buffers are allocated on entry and released before returning.
+ */
+void IRAM_ATTR benchmark_lut_calculation(void) {
+    const unsigned MEASUREMENTS = 1000;
+    // Width handed to the conversion routines; equals sizeof(line) in bytes.
+    enum { LINE_WIDTH = 1872 };
+
+    // fill a random benchmark line
+    static uint32_t line[468] __attribute__((aligned(16)));
+
+    esp_fill_random(line, sizeof(line));
+
+    uint8_t* lut = heap_caps_malloc(1 << 16, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
+    uint32_t* lut_1k = heap_caps_malloc(1 << 10, MALLOC_CAP_32BIT | MALLOC_CAP_INTERNAL);
+    if (lut == NULL || lut_1k == NULL) {
+        // Internal RAM is scarce; bail out instead of dereferencing NULL.
+        printf("benchmark: could not allocate LUT buffers\n");
+        free(lut);
+        free(lut_1k);
+        vTaskDelay(10);
+        return;
+    }
+
+    calculate_lut(lut, 1 << 16, MODE_GC16 | MODE_PACKING_1PPB_DIFFERENCE, 5, ed097tc2.mode_data[MODE_GC16]->range_data[0]);
+
+    // Widen the first 256 LUT entries to 32 bits, keeping only the low two bits.
+    for (int i=0; i<256; i++) {
+        lut_1k[i] = lut[i] & 0x3;
+    }
+
+    // Overwrite the first words with fixed values so part of the input is reproducible.
+    line[0] = 0x0F000000;
+    line[1] = 0x01020304;
+    line[2] = 0xB33348D3;
+    line[3] = 0x38E2C376;
+
+    volatile uint8_t target_buf[468];
+    uint8_t ground_truth[468];
+
+    int64_t start = esp_timer_get_time();
+
+    for (unsigned run = 0; run < MEASUREMENTS; run++) {
+        calc_epd_input_1ppB_64k(line, target_buf, lut, LINE_WIDTH);
+    }
+
+    int64_t end = esp_timer_get_time();
+
+    // Keep the reference output, then clear the buffer for the next variant.
+    memcpy(ground_truth, target_buf, sizeof(ground_truth));
+    memset(target_buf, 0, sizeof(target_buf));
 
-    int temperature = 25;
+    printf("plain: %u iterations took %lld milliseconds (%lld microseconds per invocation)\n",
+           MEASUREMENTS, (long long)((end - start)/1000), (long long)((end - start)/MEASUREMENTS));
 
-    epd_poweron();
-    epd_fullclear(&hl, temperature);
+    start = esp_timer_get_time();
 
-    epd_copy_to_framebuffer(dragon_area, dragon_data, epd_hl_get_framebuffer(&hl));
+    for (unsigned run = 0; run < MEASUREMENTS; run++) {
+        memcpy(target_buf, line, sizeof(target_buf));
+        memcpy(target_buf, line, sizeof(target_buf));
+        memcpy(target_buf, line, sizeof(target_buf));
+        memcpy(target_buf, line, sizeof(target_buf));
+    }
 
-    enum EpdDrawError _err = epd_hl_update_screen(&hl, MODE_GC16, temperature);
-    epd_poweroff();
+    end = esp_timer_get_time();
 
-    vTaskDelay(1000);
+    printf("memcpy: %u iterations took %lld milliseconds (%lld microseconds per invocation)\n",
+           MEASUREMENTS, (long long)((end - start)/1000), (long long)((end - start)/MEASUREMENTS));
+
+    memset(target_buf, 0, sizeof(target_buf));
+
+    start = esp_timer_get_time();
+
+    for (unsigned run = 0; run < MEASUREMENTS; run++) {
+        // The vector routine reads 32-bit table entries; the prototype takes a byte pointer.
+        calc_epd_input_1ppB_64k_ve(line, target_buf, (const uint8_t *)lut_1k, LINE_WIDTH);
+    }
+
+    end = esp_timer_get_time();
+
+    printf("optimized: %u iterations took %f milliseconds (%lld microseconds per invocation)\n",
+           MEASUREMENTS, (end - start)/1000.0, (long long)((end - start)/MEASUREMENTS));
+
+    if (memcmp(ground_truth, target_buf, sizeof(ground_truth)) != 0) {
+        printf("COMPARE FAILED! \n");
+        // One 32-bit input word next to the expected and the actual output byte.
+        for (int i=0; i<30; i++) {
+            printf("%lX     %X %X\n", (unsigned long)line[i], (unsigned)ground_truth[i], (unsigned)target_buf[i]);
+        }
+    }
+
+    free(lut);
+    free(lut_1k);
+
+    vTaskDelay(10);
 }
 
+void idf_loop() {
+    // Runs one LUT benchmark pass per call; the dragon demo drawing code below is commented out.
+    benchmark_lut_calculation();
+ //   EpdRect dragon_area = {.x = 0, .y = 0, .width = dragon_width, .height = dragon_height};
+
+ //   int temperature = 25;
+
+ //   epd_poweron();
+ //   epd_fullclear(&hl, temperature);
+
+ //   epd_copy_to_framebuffer(dragon_area, dragon_data, epd_hl_get_framebuffer(&hl));
+
+ //   enum EpdDrawError _err = epd_hl_update_screen(&hl, MODE_GC16, temperature);
+ //   epd_poweroff();
+
+ //   vTaskDelay(1000);
+}
+
 void idf_setup() {
-    epd_init(&DEMO_BOARD, &ED097TC2, EPD_LUT_64K);
-    epd_set_vcom(1560);
-    hl = epd_hl_init(EPD_BUILTIN_WAVEFORM);
+    // Display initialisation is commented out in this commit, so this function does nothing:
+    // epd_init(&DEMO_BOARD, &ED097TC2, EPD_LUT_64K);
+    // epd_set_vcom(1560);
+    // hl = epd_hl_init(EPD_BUILTIN_WAVEFORM);
 }
 
 #ifndef ARDUINO_ARCH_ESP32

diff --git a/examples/dragon/main/test.S b/examples/dragon/main/test.S
@@ -0,0 +1,109 @@
+#include <xtensa/config/core-isa.h>
+#include <xtensa/config/core-matmap.h>
+
+.text
+.align  4
+.global calc_epd_input_1ppB_64k_ve
+.type   calc_epd_input_1ppB_64k_ve,@function
+
+//        // CRASH AND BURN for debugging
+//        EE.MOVI.32.A q3, a2, 0
+//        EE.MOVI.32.A q3, a3, 1
+//        EE.MOVI.32.A q3, a4, 2
+//        EE.MOVI.32.A q3, a5, 3
+//        l8ui a10, a10, 0
+
+// esp_err_t calc_epd_input_1ppB_64k_ve(
+//    const uint32_t *ld,
+//    uint8_t *epd_input,
+//    const uint8_t *conversion_lut,
+//    uint32_t epd_width
+//);
+//
+// Each loop iteration consumes 16 input bytes and stores one 32-bit output
+// word; the loop runs epd_width / 16 times. LUT entries are fetched with
+// 32-bit loads (EE.LDXQ.32). Always returns 0 (ESP_OK).
+calc_epd_input_1ppB_64k_ve: 
+// input    - a2
+// output   - a3
+// lut      - a4
+// len      - a5
+
+    entry	a1, 32
+    // iteration count = len / 16
+    srli a5, a5, 4
+
+
+    // bitmasks for bit shift by multiplication
+    movi a10, 0x40001000
+    EE.MOVI.32.Q q4,a10,0
+    movi a10, 0x04000100
+    EE.MOVI.32.Q q4,a10,1
+    movi a10, 0x00400010
+    EE.MOVI.32.Q q4,a10,2
+    movi a10, 0x00040001
+    EE.MOVI.32.Q q4,a10,3
+
+    // have zero in a10
+    movi a10, 0
+
+    // TODO: can be moved out
+    EE.ZERO.Q q0
+
+    // preload the first 16 input bytes; note this executes even when the
+    // iteration count is zero and the loop below is skipped
+    EE.VLD.128.IP q1, a2, 16
+
+    loopnez a5, .loop_end_lut_lookup
+
+        // q1, q0 contain the input bytes, zero-extended to 16 bits each
+        EE.VZIP.8 q1, q0
+
+        // load 32-bit LUT results into q2, q3
+        EE.LDXQ.32 q2, q0, a4, 0, 6
+        EE.LDXQ.32 q2, q0, a4, 1, 7
+        EE.LDXQ.32 q2, q0, a4, 2, 4
+        EE.LDXQ.32 q2, q0, a4, 3, 5
+        EE.LDXQ.32 q3, q0, a4, 0, 2
+        EE.LDXQ.32 q3, q0, a4, 1, 3
+        EE.LDXQ.32 q3, q0, a4, 2, 0
+        EE.LDXQ.32 q3, q0, a4, 3, 1
+
+        EE.ZERO.ACCX
+
+        // unzip to have 16bit LUT results in q2, q3 zeroes
+        EE.VUNZIP.16 q2, q3
+
+        // multiply-accumulate the LUT results with the shift masks in q4
+        EE.VMULAS.U16.ACCX q2,q4
+
+        // load 32-bit LUT results into q2, q0
+        // We interleave the data loading with retrieving the result
+        // from the accumulator, to have better pipeline utilization
+        EE.LDXQ.32 q2, q1, a4, 0, 6
+        EE.LDXQ.32 q2, q1, a4, 1, 7
+        EE.LDXQ.32 q2, q1, a4, 2, 4
+        EE.LDXQ.32 q2, q1, a4, 3, 5
+        EE.LDXQ.32 q0, q1, a4, 0, 2
+        EE.LDXQ.32 q0, q1, a4, 1, 3
+        EE.LDXQ.32 q0, q1, a4, 2, 0
+        EE.LDXQ.32 q0, q1, a4, 3, 1
+
+        // shift result by zero and store in a6
+        EE.SRS.ACCX a6, a10, 0
+
+        EE.ZERO.ACCX
+
+        // unzip to have 16bit LUT results in q2, q0 zeroes
+        EE.VUNZIP.16 q2, q0
+
+        // first partial result goes into the upper 16 bits of the output word
+        slli a6, a6, 16
+
+        // FIXME: Loads beyond bounds - the fused load fetches the next 16
+        // input bytes, so the final iteration reads 16 bytes past the end
+        // of the input line
+        EE.VMULAS.U16.ACCX.LD.IP q1, a2, 16, q2, q4
+
+        // shift result by zero and store in a7
+        EE.SRS.ACCX a7, a10, 0
+
+        // combine results and store one 32-bit output word
+        or a6, a6, a7
+        s32i      a6, a3, 0
+        addi.n a3, a3, 4
+
+
+.loop_end_lut_lookup:
+    movi.n	a2, 0 // return status ESP_OK
+    retw.n
+