Skip to content

Commit

Permalink
implement lut lookup using vector extensions
Browse files Browse the repository at this point in the history
  • Loading branch information
vroland committed Nov 24, 2023
1 parent bd3194a commit d4ab7cd
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 13 deletions.
2 changes: 1 addition & 1 deletion examples/dragon/main/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Source files of the example's main component.
# NOTE(review): app_sources is assigned twice in a row; the second set()
# replaces the first, so the effective list is "test.S" and "main.c".
# The first assignment looks like the pre-change side of the diff - confirm
# and drop it if so.
set(app_sources "main.c")
set(app_sources "test.S" "main.c")

# Register the component with the sources above; it depends on the epdiy component.
idf_component_register(SRCS ${app_sources} REQUIRES epdiy)
Empty file added examples/dragon/main/func.h
Empty file.
121 changes: 109 additions & 12 deletions examples/dragon/main/main.c
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
/* Simple firmware for a ESP32 displaying a static image on an EPaper Screen */

#include <string.h>
#include "esp_heap_caps.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"

#include "dragon.h"
#include "epd_highlevel.h"
#include "epdiy.h"
#include "output_common/lut.h"
#include "ED097TC2.h"

EpdiyHighlevelState hl;

Expand All @@ -17,26 +20,120 @@ EpdiyHighlevelState hl;
#define DEMO_BOARD epd_board_v7
#endif

void idf_loop() {
EpdRect dragon_area = {.x = 0, .y = 0, .width = dragon_width, .height = dragon_height};
#include "esp_timer.h"
#include "esp_random.h"

/*
 * Plain LUT conversion of one line; timed as the "plain" case by
 * benchmark_lut_calculation().
 * NOTE(review): not defined in this file - presumably provided by the epdiy
 * component; confirm.
 */
void IRAM_ATTR calc_epd_input_1ppB_64k(
    const uint32_t *ld,
    uint8_t *epd_input,
    const uint8_t *conversion_lut,
    uint32_t epd_width
);

/*
 * Vector-extension variant of the conversion above, implemented in test.S.
 * Takes the same argument list; the assembly routine always returns 0.
 */
esp_err_t calc_epd_input_1ppB_64k_ve(
    const uint32_t *ld,
    uint8_t *epd_input,
    const uint8_t *conversion_lut,
    uint32_t epd_width
);

/*
 * Fills `lut` (of `lut_size` bytes) for the given draw mode, frame and
 * waveform phases.
 * NOTE(review): not defined in this file; semantics inferred from the call in
 * benchmark_lut_calculation() - confirm against the epdiy sources.
 */
enum EpdDrawError calculate_lut(
    uint8_t *lut,
    int lut_size,
    enum EpdDrawMode mode,
    int frame,
    const EpdWaveformPhases *phases
);

/**
 * Benchmark the LUT-based line conversion and print the timings.
 *
 * Three loops of MEASUREMENTS iterations each are timed with
 * esp_timer_get_time():
 *   1. the plain calc_epd_input_1ppB_64k() with the 64k LUT,
 *   2. four 468-byte memcpy() calls per iteration as a baseline,
 *   3. the vector-extension calc_epd_input_1ppB_64k_ve() with a 256-entry,
 *      32-bit-wide LUT.
 * The output of (1) is kept as ground truth and compared with the output of
 * (3); on mismatch the first 30 entries are dumped.
 *
 * Returns early, after printing a message, if a lookup table cannot be
 * allocated. Both tables are released before returning.
 */
void IRAM_ATTR benchmark_lut_calculation(void) {
    const unsigned MEASUREMENTS = 1000;

    // Benchmark input line: 468 words = 1872 bytes of payload. The vector
    // routine loads one 16-byte vector ahead of the data it processes (see the
    // FIXME in test.S), so four zero-initialised padding words keep that read
    // inside the array.
    static uint32_t line[468 + 4] __attribute__((aligned(16)));

    // Randomise the payload only; the padding stays zero.
    esp_fill_random(line, 1872);

    uint8_t *lut = heap_caps_malloc(1 << 16, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
    if (lut == NULL) {
        printf("benchmark: failed to allocate the 64k LUT\n");
        return;
    }
    calculate_lut(lut, 1 << 16, MODE_GC16 | MODE_PACKING_1PPB_DIFFERENCE, 5, ed097tc2.mode_data[MODE_GC16]->range_data[0]);

    // 256 entries of 32 bit each (1 << 10 bytes), holding the low two bits of
    // the first 256 entries of the 64k LUT.
    uint32_t *lut_1k = heap_caps_malloc(1 << 10, MALLOC_CAP_32BIT | MALLOC_CAP_INTERNAL);
    if (lut_1k == NULL) {
        printf("benchmark: failed to allocate the 1k LUT\n");
        free(lut);
        return;
    }
    for (int i=0; i<256; i++) {
        lut_1k[i] = lut[i] & 0x3;
    }

    // Overwrite the first four words with fixed patterns so the start of the
    // line is reproducible between runs.
    line[0] = 0x0F000000;
    line[1] = 0x01020304;
    line[2] = 0xB33348D3;
    line[3] = 0x38E2C376;

    volatile uint8_t target_buf[468];
    uint8_t ground_truth[468];

    // --- 1. plain implementation ---
    int64_t start = esp_timer_get_time();

    for (int retries = 0; retries < MEASUREMENTS; retries++) {
        calc_epd_input_1ppB_64k(line, target_buf, lut, 1872);
    }

    int64_t end = esp_timer_get_time();

    // Keep the plain result for the comparison and clear the output buffer.
    memcpy(ground_truth, target_buf, 468);
    memset(target_buf, 0, 468);

    // NOTE(review): `temperature` and the epd_* / vTaskDelay(1000) calls further
    // down mirror the commented-out demo in idf_loop() and look like residue of
    // the pre-change side of the diff. They are kept in place because that
    // cannot be confirmed from here; if they are real, the "memcpy" timing
    // below also covers a screen update. Confirm and remove.
    int temperature = 25;
    printf("plain: %u iterations took %llu milliseconds (%llu microseconds per invocation)\n",
           MEASUREMENTS, (end - start)/1000, (end - start)/MEASUREMENTS);

    epd_poweron();
    epd_fullclear(&hl, temperature);

    // --- 2. memcpy baseline ---
    start = esp_timer_get_time();

    epd_copy_to_framebuffer(dragon_area, dragon_data, epd_hl_get_framebuffer(&hl));
    for (int retries = 0; retries < MEASUREMENTS; retries++) {
        memcpy(target_buf, line, 468);
        memcpy(target_buf, line, 468);
        memcpy(target_buf, line, 468);
        memcpy(target_buf, line, 468);
    }

    enum EpdDrawError _err = epd_hl_update_screen(&hl, MODE_GC16, temperature);
    epd_poweroff();
    end = esp_timer_get_time();

    vTaskDelay(1000);
    printf("memcpy: %u iterations took %llu milliseconds (%llu microseconds per invocation)\n",
           MEASUREMENTS, (end - start)/1000, (end - start)/MEASUREMENTS);

    memset(target_buf, 0, 468);

    // --- 3. vector-extension implementation ---
    start = esp_timer_get_time();

    for (int retries = 0; retries < MEASUREMENTS; retries++) {
        calc_epd_input_1ppB_64k_ve(line, target_buf, lut_1k, 1872);
    }

    end = esp_timer_get_time();

    printf("optimized: %u iterations took %f milliseconds (%llu microseconds per invocation)\n",
           MEASUREMENTS, (end - start)/1000.0, (end - start)/MEASUREMENTS);

    // The vector variant must reproduce the plain output byte for byte.
    if (memcmp(ground_truth, target_buf, 468) != 0) {
        printf("COMPARE FAILED! \n");
        for (int i=0; i<30; i++) {
            printf("%lX %X %X\n", line[i], ground_truth[i], target_buf[i]);
        }
    }

    // Release both tables; the original leaked lut_1k on every call.
    free(lut_1k);
    free(lut);

    vTaskDelay(10);
}

// Loop body of the example: runs the LUT conversion benchmark once per call.
// The original demo, which drew the dragon image on the display, is kept
// below in commented-out form.
// NOTE(review): the caller is outside the visible part of the file -
// presumably this is invoked repeatedly from the main task; confirm.
void idf_loop() {
benchmark_lut_calculation();
// Disabled demo code:
// EpdRect dragon_area = {.x = 0, .y = 0, .width = dragon_width, .height = dragon_height};

// int temperature = 25;

// epd_poweron();
// epd_fullclear(&hl, temperature);

// epd_copy_to_framebuffer(dragon_area, dragon_data, epd_hl_get_framebuffer(&hl));

// enum EpdDrawError _err = epd_hl_update_screen(&hl, MODE_GC16, temperature);
// epd_poweroff();

// vTaskDelay(1000);
}// end of idf_loop

// One-time setup of the example: initialises the display driver for
// DEMO_BOARD with the ED097TC2 panel and the 64k LUT, sets VCOM to 1560 and
// initialises the high-level state `hl` with the built-in waveform.
// NOTE(review): the same three calls appear once active and once commented
// out. The active copies look like the pre-change side of the diff, in which
// case setup is effectively empty and `hl` is never initialised - confirm
// which copy is intended.
void idf_setup() {
epd_init(&DEMO_BOARD, &ED097TC2, EPD_LUT_64K);
epd_set_vcom(1560);
hl = epd_hl_init(EPD_BUILTIN_WAVEFORM);
// epd_init(&DEMO_BOARD, &ED097TC2, EPD_LUT_64K);
// epd_set_vcom(1560);
// hl = epd_hl_init(EPD_BUILTIN_WAVEFORM);
}

#ifndef ARDUINO_ARCH_ESP32
Expand Down
109 changes: 109 additions & 0 deletions examples/dragon/main/test.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>

.text
.align 4
.global calc_epd_input_1ppB_64k_ve
.type calc_epd_input_1ppB_64k_ve,@function

// // CRASH AND BURN for debugging
// EE.MOVI.32.A q3, a2, 0
// EE.MOVI.32.A q3, a3, 1
// EE.MOVI.32.A q3, a4, 2
// EE.MOVI.32.A q3, a5, 3
// l8ui a10, a10, 0

// esp_err_t calc_epd_input_1ppB_64k_ve(
// const uint32_t *ld,
// uint8_t *epd_input,
// const uint8_t *conversion_lut,
// uint32_t epd_width
// );
// Vector-extension variant of the LUT line conversion.
// Every loop iteration consumes 16 input bytes, looks each of them up in a
// table of 32-bit entries and stores one packed 32-bit word to the output.
// Always returns 0 in a2.
calc_epd_input_1ppB_64k_ve:
// Register arguments:
// input - a2 (advanced by 16 bytes per vector load)
// output - a3 (advanced by 4 bytes per iteration)
// lut - a4 (base address of the table of 32-bit entries)
// len - a5 (input length; converted to the loop count below)

entry a1, 32
// loop count = len / 16, one iteration per 16-byte input vector
srli a5, a5, 4


// bitmasks for bit shift by multiplication:
// the eight 16-bit lanes of q4 hold the powers of four 0x4000 ... 0x0001, so
// multiplying a lane by its factor shifts it left by a multiple of two bits.
// Presumably this lets the multiply-accumulate pack eight 2-bit LUT results
// into one 16-bit value - TODO confirm lane order against the ISA manual.
movi a10, 0x40001000
EE.MOVI.32.Q q4,a10,0
movi a10, 0x04000100
EE.MOVI.32.Q q4,a10,1
movi a10, 0x00400010
EE.MOVI.32.Q q4,a10,2
movi a10, 0x00040001
EE.MOVI.32.Q q4,a10,3

// have zero in a10; used as the shift amount of EE.SRS.ACCX below
movi a10, 0

// TODO: can be moved out
// q0 must be all zero before the first EE.VZIP.8
EE.ZERO.Q q0

// preload the first 16 input bytes into q1, post-incrementing a2 by 16
EE.VLD.128.IP q1, a2, 16

// hardware loop over a5 iterations; the body ends at .loop_end_lut_lookup
// and is skipped entirely when a5 is zero
loopnez a5, .loop_end_lut_lookup

// interleave the input bytes in q1 with the zero bytes in q0: afterwards
// q1 and q0 together contain the 16 input bytes, each zero-extended to
// 16 bits (the earlier comment said "16 bytes")
EE.VZIP.8 q1, q0

// load 32-bit LUT results into q2, q3, using the eight 16-bit lanes of q0
// as table indices (last operand: index lane, second to last: destination
// 32-bit lane)
EE.LDXQ.32 q2, q0, a4, 0, 6
EE.LDXQ.32 q2, q0, a4, 1, 7
EE.LDXQ.32 q2, q0, a4, 2, 4
EE.LDXQ.32 q2, q0, a4, 3, 5
EE.LDXQ.32 q3, q0, a4, 0, 2
EE.LDXQ.32 q3, q0, a4, 1, 3
EE.LDXQ.32 q3, q0, a4, 2, 0
EE.LDXQ.32 q3, q0, a4, 3, 1

EE.ZERO.ACCX

// unzip to have the 16-bit LUT results in q2; q3 receives the upper halves
// of the 32-bit entries, which are expected to be zero
EE.VUNZIP.16 q2, q3

// multiply each result by its power-of-four factor and sum into ACCX
EE.VMULAS.U16.ACCX q2,q4

// load 32-bit LUT results into q2, q0 (the earlier comment named q3, q0),
// this time using the lanes of q1 as table indices.
// We interleave the data loading with retrieving the result
// from the accumulator, to have better pipeline utilization
EE.LDXQ.32 q2, q1, a4, 0, 6
EE.LDXQ.32 q2, q1, a4, 1, 7
EE.LDXQ.32 q2, q1, a4, 2, 4
EE.LDXQ.32 q2, q1, a4, 3, 5
EE.LDXQ.32 q0, q1, a4, 0, 2
EE.LDXQ.32 q0, q1, a4, 1, 3
EE.LDXQ.32 q0, q1, a4, 2, 0
EE.LDXQ.32 q0, q1, a4, 3, 1

// shift result by zero and store in a6 (packed result of the first half)
EE.SRS.ACCX a6, a10, 0

EE.ZERO.ACCX

// unzip to have the 16-bit LUT results in q2, q0 zeroes.
// NOTE(review): this presumably also leaves q0 all zero for the EE.VZIP.8
// of the next iteration, which only holds while every LUT entry fits in
// 16 bits - confirm.
EE.VUNZIP.16 q2, q0

// move the first half's result into the upper 16 bits of the output word
slli a6, a6, 16

// FIXME: Loads beyond bounds
// This also fetches the next 16 input bytes into q1. Together with the
// preload before the loop, len/16 + 1 vectors are read, so the last
// iteration reads 16 bytes past the end of the input.
EE.VMULAS.U16.ACCX.LD.IP q1, a2, 16, q2, q4

// shift result by zero and store in a7 (packed result of the second half)
EE.SRS.ACCX a7, a10, 0

// combine results: first half in the upper, second half in the lower 16 bits
or a6, a6, a7
// store the 32-bit output word and advance the output pointer
s32i a6, a3, 0
addi.n a3, a3, 4


.loop_end_lut_lookup:
movi.n a2, 0 // return status ESP_OK
retw.n

0 comments on commit d4ab7cd

Please sign in to comment.