From 8e1c61a6f7a76bacedc7311ffce9e4b79c2aab8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ars=C3=A8ne=20P=C3=A9rard-Gayot?=
Date: Thu, 19 Nov 2020 16:07:06 +0100
Subject: [PATCH] Improvements to the `RenderLine()` function (#920)

* Improvements to the `RenderLine()` function

- Simplify by using indices instead of incrementing pointers
- Improve performance in the case where mask != -1 by only processing the bits that are set
- Build the length mask without shifting by 32 (undefined when n == 0 or n == 32)
---
 Source/render.cpp | 86 ++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 59 insertions(+), 27 deletions(-)

diff --git a/Source/render.cpp b/Source/render.cpp
index c1f4abb83..3eff5c2f2 100644
--- a/Source/render.cpp
+++ b/Source/render.cpp
@@ -133,54 +133,86 @@ static DWORD LeftFoliageMask[TILE_HEIGHT] = {
 	0xFFFFFFF0, 0xFFFFFFFC,
 };
 
-inline static void RenderLine(BYTE **dst, BYTE **src, int n, BYTE *tbl, DWORD mask)
-{
+/**
+ * Counts the zero bits above the most significant set bit of a 32-bit mask.
+ */
+inline static int count_leading_zeros(DWORD mask) {
+	// Note: This assumes that the argument is not zero,
+	// which means there is at least one bit set.
+#if defined(__GNUC__) || defined(__clang__)
+	return __builtin_clz(mask);
+#else
 	int i;
+	for (i = 0; (mask & 0x80000000) == 0; i++, mask <<= 1);
+	return i;
+#endif
+}
 
+/**
+ * Calls f(i) for each set bit of mask, most significant bit first,
+ * where i is the bit's distance from bit 31 (so bit 31 is i == 0).
+ */
+template <typename F>
+void foreach_set_bit(DWORD mask, const F& f) {
+	int i = 0;
+	while (mask != 0) {
+		int z = count_leading_zeros(mask);
+		i += z, mask <<= z;
+		for (; mask & 0x80000000; i++, mask <<= 1)
+			f(i);
+	}
+}
+
+/**
+ * Draws one line of n pixels from *src to *dst, then advances both by n.
+ * With mask == 0xFFFFFFFF every pixel is written; otherwise only pixels whose
+ * mask bit is set are written, with bit 31 selecting the first pixel.
+ * Pixels are zeroed when light_table_index == lightmax, copied as-is when it
+ * is 0, and looked up through tbl otherwise.
+ */
+inline static void RenderLine(BYTE **dst, BYTE **src, int n, BYTE *tbl, DWORD mask)
+{
 #ifdef NO_OVERDRAW
 	if (*dst < gpBufStart || *dst > gpBufEnd) {
-		*src += n;
-		*dst += n;
-		return;
+		goto skip;
 	}
 #endif
 
 	if (mask == 0xFFFFFFFF) {
 		if (light_table_index == lightmax) {
 			memset(*dst, 0, n);
-			(*src) += n;
-			(*dst) += n;
 		} else if (light_table_index == 0) {
 			memcpy(*dst, *src, n);
-			(*src) += n;
-			(*dst) += n;
 		} else {
-			for (i = 0; i < n; i++, (*src)++, (*dst)++) {
-				(*dst)[0] = tbl[(*src)[0]];
+			for (int i = 0; i < n; i++) {
+				(*dst)[i] = tbl[(*src)[i]];
 			}
 		}
 	} else {
+		// The number of iterations is anyway limited by the size of the mask.
+		// So we can limit it by ANDing the mask with another mask that only keeps
+		// iterations that are lower than n. We can now avoid testing if i < n
+		// at every loop iteration.
+		// A 32-bit shift by 32 is undefined, so n <= 0 (keep nothing) and
+		// n >= 32 (keep everything) are handled without shifting.
+		if (n <= 0) {
+			mask = 0;
+		} else if (n < 32) {
+			mask &= ~(0xFFFFFFFFU >> n);
+		}
+
 		if (light_table_index == lightmax) {
-			(*src) += n;
-			for (i = 0; i < n; i++, (*dst)++, mask <<= 1) {
-				if (mask & 0x80000000) {
-					(*dst)[0] = 0;
-				}
-			}
+			foreach_set_bit(mask, [=] (int i) { (*dst)[i] = 0; });
 		} else if (light_table_index == 0) {
-			for (i = 0; i < n; i++, (*src)++, (*dst)++, mask <<= 1) {
-				if (mask & 0x80000000) {
-					(*dst)[0] = (*src)[0];
-				}
-			}
+			foreach_set_bit(mask, [=] (int i) { (*dst)[i] = (*src)[i]; });
 		} else {
-			for (i = 0; i < n; i++, (*src)++, (*dst)++, mask <<= 1) {
-				if (mask & 0x80000000) {
-					(*dst)[0] = tbl[(*src)[0]];
-				}
-			}
+			foreach_set_bit(mask, [=] (int i) { (*dst)[i] = tbl[(*src)[i]]; });
 		}
 	}
+
+skip:
+	(*src) += n;
+	(*dst) += n;
 }
 
 #if defined(__clang__) || defined(__GNUC__)