From d4ab7cdb933574c1c9a008e62c4393b3e96df378 Mon Sep 17 00:00:00 2001
From: Valentin Roland
Date: Fri, 24 Nov 2023 23:38:05 +0100
Subject: [PATCH] implement lut lookup using vector extensions

---
Review notes (ignored by git am, not part of the commit message):
- main.c: both LUT allocations are checked, and lut_1k is now freed together
  with lut (it was leaked on every benchmark run).
- main.c: the benchmark line is padded by 16 bytes because the last loop
  iteration in test.S still loads one vector past the input (FIXME there).
- main.c: lut_1k is cast explicitly at the call site; the declared parameter
  type of calc_epd_input_1ppB_64k_ve is unchanged.
- test.S: only comments were corrected, the instruction stream is untouched.
- The three bare "#include" directives have no header name in this copy of
  the patch; restore them before building.
- Indentation was reconstructed; use "git apply --ignore-whitespace" if the
  context does not match. The index lines still carry the original blob ids.

 examples/dragon/main/CMakeLists.txt |   2 +-
 examples/dragon/main/func.h         |   0
 examples/dragon/main/main.c         | 137 +++++++++++++++++++++++++---
 examples/dragon/main/test.S         | 112 ++++++++++++++++++++++
 4 files changed, 238 insertions(+), 13 deletions(-)
 create mode 100644 examples/dragon/main/func.h
 create mode 100644 examples/dragon/main/test.S

diff --git a/examples/dragon/main/CMakeLists.txt b/examples/dragon/main/CMakeLists.txt
index 475be06b..3a86b488 100644
--- a/examples/dragon/main/CMakeLists.txt
+++ b/examples/dragon/main/CMakeLists.txt
@@ -1,3 +1,3 @@
-set(app_sources "main.c")
+set(app_sources "test.S" "main.c")
 
 idf_component_register(SRCS ${app_sources} REQUIRES epdiy)
diff --git a/examples/dragon/main/func.h b/examples/dragon/main/func.h
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/dragon/main/main.c b/examples/dragon/main/main.c
index 5f6e374a..f0c8352a 100644
--- a/examples/dragon/main/main.c
+++ b/examples/dragon/main/main.c
@@ -1,5 +1,6 @@
 /* Simple firmware for a ESP32 displaying a static image on an EPaper Screen */
 
+#include
 #include "esp_heap_caps.h"
 #include "freertos/FreeRTOS.h"
 #include "freertos/task.h"
@@ -7,6 +8,8 @@
 #include "dragon.h"
 #include "epd_highlevel.h"
 #include "epdiy.h"
+#include "output_common/lut.h"
+#include "ED097TC2.h"
 
 EpdiyHighlevelState hl;
 
@@ -17,26 +20,136 @@ EpdiyHighlevelState hl;
 #define DEMO_BOARD epd_board_v7
 #endif
 
-void idf_loop() {
-    EpdRect dragon_area = {.x = 0, .y = 0, .width = dragon_width, .height = dragon_height};
+#include "esp_timer.h"
+#include "esp_random.h"
+
+void IRAM_ATTR calc_epd_input_1ppB_64k(const uint32_t *ld, uint8_t *epd_input, const uint8_t *conversion_lut, uint32_t epd_width);
+
+esp_err_t calc_epd_input_1ppB_64k_ve(const uint32_t *ld, uint8_t *epd_input, const uint8_t *conversion_lut, uint32_t epd_width);
+
+enum EpdDrawError calculate_lut(
+    uint8_t* lut,
+    int lut_size,
+    enum EpdDrawMode mode,
+    int frame,
+    const EpdWaveformPhases* phases
+);
+
+/**
+ * Benchmark calc_epd_input_1ppB_64k against the vector-extension variant.
+ *
+ * Runs each implementation (and a memcpy baseline) MEASUREMENTS times on one
+ * 1872-byte test line, prints the timings and compares both outputs.
+ * Returns early, without measuring, if the lookup tables cannot be allocated.
+ */
+void IRAM_ATTR benchmark_lut_calculation(void) {
+    const unsigned MEASUREMENTS = 1000;
+
+    // fill a random benchmark line; the 4 extra words keep the vector routine's
+    // final 16-byte look-ahead load (see FIXME in test.S) inside the array
+    static uint32_t line[468 + 4] __attribute__((aligned(16)));
+
+    esp_fill_random(line, 1872);
+
+    uint8_t* lut = heap_caps_malloc(1 << 16, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
+    uint32_t* lut_1k = heap_caps_malloc(1 << 10, MALLOC_CAP_32BIT | MALLOC_CAP_INTERNAL);
+    if (lut == NULL || lut_1k == NULL) {
+        printf("benchmark: LUT allocation failed\n");
+        free(lut);
+        free(lut_1k);
+        return;
+    }
+
+    calculate_lut(lut, 1 << 16, MODE_GC16 | MODE_PACKING_1PPB_DIFFERENCE, 5, ed097tc2.mode_data[MODE_GC16]->range_data[0]);
+
+    // 32-bit wide table for the vector routine (test.S loads 32-bit LUT results)
+    for (int i=0; i<256; i++) {
+        lut_1k[i] = lut[i] & 0x3;
+    }
+
+    line[0] = 0x0F000000;
+    line[1] = 0x01020304;
+    line[2] = 0xB33348D3;
+    line[3] = 0x38E2C376;
+
+    volatile uint8_t target_buf[468];
+    uint8_t ground_truth[468];
+
+    int64_t start = esp_timer_get_time();
+
+    for (int retries = 0; retries < MEASUREMENTS; retries++) {
+        calc_epd_input_1ppB_64k(line, target_buf, lut, 1872);
+    }
+
+    int64_t end = esp_timer_get_time();
+
+    memcpy(ground_truth, target_buf, 468);
+    memset(target_buf, 0, 468);
 
-    int temperature = 25;
+    printf("plain: %u iterations took %llu milliseconds (%llu microseconds per invocation)\n",
+           MEASUREMENTS, (end - start)/1000, (end - start)/MEASUREMENTS);
 
-    epd_poweron();
-    epd_fullclear(&hl, temperature);
+    start = esp_timer_get_time();
 
-    epd_copy_to_framebuffer(dragon_area, dragon_data, epd_hl_get_framebuffer(&hl));
+    for (int retries = 0; retries < MEASUREMENTS; retries++) {
+        memcpy(target_buf, line, 468);
+        memcpy(target_buf, line, 468);
+        memcpy(target_buf, line, 468);
+        memcpy(target_buf, line, 468);
+    }
 
-    enum EpdDrawError _err = epd_hl_update_screen(&hl, MODE_GC16, temperature);
-    epd_poweroff();
+    end = esp_timer_get_time();
 
-    vTaskDelay(1000);
+    printf("memcpy: %u iterations took %llu milliseconds (%llu microseconds per invocation)\n",
+           MEASUREMENTS, (end - start)/1000, (end - start)/MEASUREMENTS);
+
+    memset(target_buf, 0, 468);
+
+    start = esp_timer_get_time();
+
+    for (int retries = 0; retries < MEASUREMENTS; retries++) {
+        calc_epd_input_1ppB_64k_ve(line, target_buf, (const uint8_t *)lut_1k, 1872);
+    }
+
+    end = esp_timer_get_time();
+
+    printf("optimized: %u iterations took %f milliseconds (%llu microseconds per invocation)\n",
+           MEASUREMENTS, (end - start)/1000.0, (end - start)/MEASUREMENTS);
+
+    if (memcmp(ground_truth, target_buf, 468) != 0) {
+        printf("COMPARE FAILED! \n");
+        for (int i=0; i<30; i++) {
+            printf("%lX %X %X\n", line[i], ground_truth[i], target_buf[i]);
+        }
+    }
+
+    free(lut);
+    free(lut_1k);
+
+    vTaskDelay(10);
 }
 
+void idf_loop() {
+    benchmark_lut_calculation();
+    // EpdRect dragon_area = {.x = 0, .y = 0, .width = dragon_width, .height = dragon_height};
+
+    // int temperature = 25;
+
+    // epd_poweron();
+    // epd_fullclear(&hl, temperature);
+
+    // epd_copy_to_framebuffer(dragon_area, dragon_data, epd_hl_get_framebuffer(&hl));
+
+    // enum EpdDrawError _err = epd_hl_update_screen(&hl, MODE_GC16, temperature);
+    // epd_poweroff();
+
+    // vTaskDelay(1000);
+}
+
 void idf_setup() {
-    epd_init(&DEMO_BOARD, &ED097TC2, EPD_LUT_64K);
-    epd_set_vcom(1560);
-    hl = epd_hl_init(EPD_BUILTIN_WAVEFORM);
+    // epd_init(&DEMO_BOARD, &ED097TC2, EPD_LUT_64K);
+    // epd_set_vcom(1560);
+    // hl = epd_hl_init(EPD_BUILTIN_WAVEFORM);
 }
 
 #ifndef ARDUINO_ARCH_ESP32
diff --git a/examples/dragon/main/test.S b/examples/dragon/main/test.S
new file mode 100644
index 00000000..729b7ef5
--- /dev/null
+++ b/examples/dragon/main/test.S
@@ -0,0 +1,112 @@
+#include
+#include
+
+.text
+.align 4
+.global calc_epd_input_1ppB_64k_ve
+.type   calc_epd_input_1ppB_64k_ve,@function
+
+// // CRASH AND BURN for debugging
+// EE.MOVI.32.A q3, a2, 0
+// EE.MOVI.32.A q3, a3, 1
+// EE.MOVI.32.A q3, a4, 2
+// EE.MOVI.32.A q3, a5, 3
+// l8ui a10, a10, 0
+
+// esp_err_t calc_epd_input_1ppB_64k_ve(
+//   const uint32_t *ld,
+//   uint8_t *epd_input,
+//   const uint8_t *conversion_lut,
+//   uint32_t epd_width
+//);
+//
+// The loop runs epd_width / 16 times; each iteration loads one 128-bit input
+// vector and stores one 32-bit output word. The return value is always 0.
+calc_epd_input_1ppB_64k_ve:
+// input - a2
+// output - a3
+// lut - a4
+// len - a5
+
+    entry a1, 32
+    srli a5, a5, 4
+
+
+    // bitmasks for bit shift by multiplication
+    movi a10, 0x40001000
+    EE.MOVI.32.Q q4,a10,0
+    movi a10, 0x04000100
+    EE.MOVI.32.Q q4,a10,1
+    movi a10, 0x00400010
+    EE.MOVI.32.Q q4,a10,2
+    movi a10, 0x00040001
+    EE.MOVI.32.Q q4,a10,3
+
+    // have zero in a10
+    movi a10, 0
+
+    // TODO: can be moved out
+    EE.ZERO.Q q0
+
+    EE.VLD.128.IP q1, a2, 16
+
+    loopnez a5, .loop_end_lut_lookup
+
+        // q1, q0 contain the input bytes, zero-extended to 16 bits
+        EE.VZIP.8 q1, q0
+
+        // load 32-bit LUT results into q2, q3
+        EE.LDXQ.32 q2, q0, a4, 0, 6
+        EE.LDXQ.32 q2, q0, a4, 1, 7
+        EE.LDXQ.32 q2, q0, a4, 2, 4
+        EE.LDXQ.32 q2, q0, a4, 3, 5
+        EE.LDXQ.32 q3, q0, a4, 0, 2
+        EE.LDXQ.32 q3, q0, a4, 1, 3
+        EE.LDXQ.32 q3, q0, a4, 2, 0
+        EE.LDXQ.32 q3, q0, a4, 3, 1
+
+        EE.ZERO.ACCX
+
+        // zip to have 16bit LUT results in q2, q3 zeroes
+        EE.VUNZIP.16 q2, q3
+
+        EE.VMULAS.U16.ACCX q2,q4
+
+        // load 32-bit LUT results into q2, q0
+        // We interleave the data loading with retrieving the result
+        // from the accumulator, to have better pipeline utilization
+        EE.LDXQ.32 q2, q1, a4, 0, 6
+        EE.LDXQ.32 q2, q1, a4, 1, 7
+        EE.LDXQ.32 q2, q1, a4, 2, 4
+        EE.LDXQ.32 q2, q1, a4, 3, 5
+        EE.LDXQ.32 q0, q1, a4, 0, 2
+        EE.LDXQ.32 q0, q1, a4, 1, 3
+        EE.LDXQ.32 q0, q1, a4, 2, 0
+        EE.LDXQ.32 q0, q1, a4, 3, 1
+
+        // shift result by zero and store in a6
+        EE.SRS.ACCX a6, a10, 0
+
+        EE.ZERO.ACCX
+
+        // zip to have 16bit LUT results in q2, q0 zeroes
+        EE.VUNZIP.16 q2, q0
+
+        slli a6, a6, 16
+
+        // FIXME: Loads beyond bounds (the last iteration still fetches the next 16 input bytes)
+        EE.VMULAS.U16.ACCX.LD.IP q1, a2, 16, q2, q4
+
+        // shift result by zero and store in a7
+        EE.SRS.ACCX a7, a10, 0
+
+        // combine results
+        or a6, a6, a7
+        s32i a6, a3, 0
+        addi.n a3, a3, 4
+
+
+.loop_end_lut_lookup:
+    movi.n a2, 0 // return status ESP_OK
+    retw.n
+