From ae22644d4347b34f08a5b157288f72587b147676 Mon Sep 17 00:00:00 2001 From: Gleb Mazovetskiy Date: Sat, 13 Mar 2021 03:30:10 +0000 Subject: [PATCH] CelOutputBuffer: Recover lost performance Makes `RenderLine` hotspot manipulate the buffer directly again. FPS: 195 -> 275 --- Source/render.cpp | 120 ++++++++++++++++++++++++---------------------- defs.h | 12 +++++ 2 files changed, 76 insertions(+), 56 deletions(-) diff --git a/Source/render.cpp b/Source/render.cpp index 4f2460263..28314448c 100644 --- a/Source/render.cpp +++ b/Source/render.cpp @@ -9,6 +9,8 @@ DEVILUTION_BEGIN_NAMESPACE #define NO_OVERDRAW +namespace { + enum { RT_SQUARE, RT_TRANSPARENT, @@ -19,7 +21,7 @@ enum { }; /** Fully transparent variant of WallMask. */ -static DWORD WallMask_FullyTrasparent[TILE_HEIGHT] = { +const DWORD WallMask_FullyTrasparent[TILE_HEIGHT] = { 0x00000000, 0x00000000, 0x00000000, @@ -54,7 +56,7 @@ static DWORD WallMask_FullyTrasparent[TILE_HEIGHT] = { 0x00000000 }; /** Transparent variant of RightMask. */ -static DWORD RightMask_Transparent[TILE_HEIGHT] = { +const DWORD RightMask_Transparent[TILE_HEIGHT] = { 0xC0000000, 0xF0000000, 0xFC000000, @@ -89,7 +91,7 @@ static DWORD RightMask_Transparent[TILE_HEIGHT] = { 0xFFFFFFFF }; /** Transparent variant of LeftMask. */ -static DWORD LeftMask_Transparent[TILE_HEIGHT] = { +const DWORD LeftMask_Transparent[TILE_HEIGHT] = { 0x00000003, 0x0000000F, 0x0000003F, @@ -124,7 +126,7 @@ static DWORD LeftMask_Transparent[TILE_HEIGHT] = { 0xFFFFFFFF }; /** Specifies the draw masks used to render transparency of the right side of tiles. */ -static DWORD RightMask[TILE_HEIGHT] = { +const DWORD RightMask[TILE_HEIGHT] = { 0xEAAAAAAA, 0xF5555555, 0xFEAAAAAA, @@ -159,7 +161,7 @@ static DWORD RightMask[TILE_HEIGHT] = { 0xFFFFFFFF }; /** Specifies the draw masks used to render transparency of the left side of tiles. */ -static DWORD LeftMask[TILE_HEIGHT] = { +const DWORD LeftMask[TILE_HEIGHT] = { 0xAAAAAAAB, 0x5555555F, 0xAAAAAABF, @@ -194,7 +196,7 @@ static DWORD LeftMask[TILE_HEIGHT] = { 0xFFFFFFFF }; /** Specifies the draw masks used to render transparency of wall tiles. */ -static DWORD WallMask[TILE_HEIGHT] = { +const DWORD WallMask[TILE_HEIGHT] = { 0xAAAAAAAA, 0x55555555, 0xAAAAAAAA, @@ -229,7 +231,7 @@ static DWORD WallMask[TILE_HEIGHT] = { 0x55555555 }; /** Fully opaque mask */ -static DWORD SolidMask[TILE_HEIGHT] = { +const DWORD SolidMask[TILE_HEIGHT] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, @@ -264,7 +266,7 @@ static DWORD SolidMask[TILE_HEIGHT] = { 0xFFFFFFFF }; /** Used to mask out the left half of the tile diamond and only render additional content */ -static DWORD RightFoliageMask[TILE_HEIGHT] = { +const DWORD RightFoliageMask[TILE_HEIGHT] = { 0xFFFFFFFF, 0x3FFFFFFF, 0x0FFFFFFF, @@ -299,7 +301,7 @@ static DWORD RightFoliageMask[TILE_HEIGHT] = { 0x00000000, }; /** Used to mask out the left half of the tile diamond and only render additional content */ -static DWORD LeftFoliageMask[TILE_HEIGHT] = { +const DWORD LeftFoliageMask[TILE_HEIGHT] = { 0xFFFFFFFF, 0xFFFFFFFC, 0xFFFFFFF0, @@ -334,7 +336,7 @@ static DWORD LeftFoliageMask[TILE_HEIGHT] = { 0x00000000, }; -inline static int count_leading_zeros(DWORD mask) +inline int count_leading_zeros(DWORD mask) { // Note: This function assumes that the argument is not zero, // which means there is at least one bit set. @@ -372,23 +374,16 @@ void foreach_set_bit(DWORD mask, const F &f) } } -inline static void RenderLine(CelOutputBuffer out, int &x, int y, BYTE **src, int n, BYTE *tbl, DWORD mask) +inline void DoRenderLine(BYTE *dst, BYTE *src, int n, BYTE *tbl, DWORD mask) { - BYTE *dst = out.at(x, y); -#ifdef NO_OVERDRAW - if (y < SCREEN_Y || dst > out.end()) { - goto skip; - } -#endif - if (mask == 0xFFFFFFFF) { // Opaque line if (light_table_index == lightmax) { // Complete darkness memset(dst, 0, n); } else if (light_table_index == 0) { // Fully lit - memcpy(dst, *src, n); + memcpy(dst, src, n); } else { // Partially lit for (int i = 0; i < n; i++) { - dst[i] = tbl[(*src)[i]]; + dst[i] = tbl[src[i]]; } } } else { @@ -410,34 +405,42 @@ inline static void RenderLine(CelOutputBuffer out, int &x, int y, BYTE **src, in } else if (light_table_index == 0) { // Fully lit for (int i = 0; i < n; i++, mask <<= 1) { if (mask & 0x80000000) - dst[i] = (*src)[i]; + dst[i] = src[i]; else - dst[i] = paletteTransparencyLookup[dst[i]][(*src)[i]]; + dst[i] = paletteTransparencyLookup[dst[i]][src[i]]; } } else { // Partially lit for (int i = 0; i < n; i++, mask <<= 1) { if (mask & 0x80000000) - dst[i] = tbl[(*src)[i]]; + dst[i] = tbl[src[i]]; else - dst[i] = paletteTransparencyLookup[dst[i]][tbl[(*src)[i]]]; + dst[i] = paletteTransparencyLookup[dst[i]][tbl[src[i]]]; } } } else { // Stippled transparancy if (light_table_index == lightmax) { // Complete darkness foreach_set_bit(mask, [=](int i) { dst[i] = 0; }); } else if (light_table_index == 0) { // Fully lit - foreach_set_bit(mask, [=](int i) { dst[i] = (*src)[i]; }); + foreach_set_bit(mask, [=](int i) { dst[i] = src[i]; }); } else { // Partially lit - foreach_set_bit(mask, [=](int i) { dst[i] = tbl[(*src)[i]]; }); + foreach_set_bit(mask, [=](int i) { dst[i] = tbl[src[i]]; }); } } } +} -skip: +DVL_ATTRIBUTE_ALWAYS_INLINE +inline void RenderLine(BYTE *dst_begin, BYTE *dst_end, BYTE **dst, BYTE **src, int n, BYTE *tbl, DWORD mask) { +#ifdef NO_OVERDRAW + if (*dst >= dst_begin && *dst <= dst_end) +#endif + DoRenderLine(*dst, *src, n, tbl, mask); (*src) += n; - x += n; + (*dst) += n; } +} // namespace + #if defined(__clang__) || defined(__GNUC__) __attribute__((no_sanitize("shift-base"))) #endif @@ -447,7 +450,8 @@ void RenderTile(CelOutputBuffer out, int x, int y) int i, j; char c, v, tile; BYTE *src, *tbl; - DWORD m, *mask, *pFrameTable; + DWORD m, *pFrameTable; + const DWORD *mask; pFrameTable = (DWORD *)pDungeonCels; @@ -501,68 +505,72 @@ void RenderTile(CelOutputBuffer out, int x, int y) } #endif + BYTE *dst_begin = out.at(0, SCREEN_Y); + BYTE *dst_end = out.end(); + BYTE *dst = out.at(x, y); + const int dst_pitch = out.pitch(); switch (tile) { case RT_SQUARE: - for (i = TILE_HEIGHT; i != 0; --i, --y, x -= TILE_WIDTH / 2, mask--) { - RenderLine(out, x, y, &src, TILE_WIDTH / 2, tbl, *mask); + for (i = TILE_HEIGHT; i != 0; i--, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2, tbl, *mask); } break; case RT_TRANSPARENT: - for (i = TILE_HEIGHT; i != 0; --i, --y, x -= TILE_WIDTH / 2, mask--) { + for (i = TILE_HEIGHT; i != 0; i--, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { m = *mask; for (j = TILE_WIDTH / 2; j != 0; j -= v, v == TILE_WIDTH / 2 ? m = 0 : m <<= v) { v = *src++; if (v >= 0) { - RenderLine(out, x, y, &src, v, tbl, m); + RenderLine(dst_begin, dst_end, &dst, &src, v, tbl, m); } else { v = -v; - x += v; + dst += v; } } } break; case RT_LTRIANGLE: - for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, --y, x -= TILE_WIDTH / 2, mask--) { + for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { src += i & 2; - x += i; - RenderLine(out, x, y, &src, TILE_WIDTH / 2 - i, tbl, *mask); + dst += i; + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2 - i, tbl, *mask); } - for (i = 2; i != TILE_WIDTH / 2; i += 2, --y, x -= TILE_WIDTH / 2, mask--) { + for (i = 2; i != TILE_WIDTH / 2; i += 2, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { src += i & 2; - x += i; - RenderLine(out, x, y, &src, TILE_WIDTH / 2 - i, tbl, *mask); + dst += i; + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2 - i, tbl, *mask); } break; case RT_RTRIANGLE: - for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, --y, x -= TILE_WIDTH / 2, mask--) { - RenderLine(out, x, y, &src, TILE_WIDTH / 2 - i, tbl, *mask); + for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2 - i, tbl, *mask); src += i & 2; - x += i; + dst += i; } - for (i = 2; i != TILE_HEIGHT; i += 2, --y, x -= TILE_WIDTH / 2, mask--) { - RenderLine(out, x, y, &src, TILE_WIDTH / 2 - i, tbl, *mask); + for (i = 2; i != TILE_HEIGHT; i += 2, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2 - i, tbl, *mask); src += i & 2; - x += i; + dst += i; } break; case RT_LTRAPEZOID: - for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, --y, x -= TILE_WIDTH / 2, mask--) { + for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { src += i & 2; - x += i; - RenderLine(out, x, y, &src, TILE_WIDTH / 2 - i, tbl, *mask); + dst += i; + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2 - i, tbl, *mask); } - for (i = TILE_HEIGHT / 2; i != 0; --i, --y, x -= TILE_WIDTH / 2, mask--) { - RenderLine(out, x, y, &src, TILE_WIDTH / 2, tbl, *mask); + for (i = TILE_HEIGHT / 2; i != 0; i--, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2, tbl, *mask); } break; case RT_RTRAPEZOID: - for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, --y, x -= TILE_WIDTH / 2, mask--) { - RenderLine(out, x, y, &src, TILE_WIDTH / 2 - i, tbl, *mask); + for (i = TILE_HEIGHT - 2; i >= 0; i -= 2, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2 - i, tbl, *mask); src += i & 2; - x += i; + dst += i; } - for (i = TILE_HEIGHT / 2; i != 0; i--, --y, x -= TILE_WIDTH / 2, mask--) { - RenderLine(out, x, y, &src, TILE_WIDTH / 2, tbl, *mask); + for (i = TILE_HEIGHT / 2; i != 0; i--, dst -= dst_pitch + TILE_WIDTH / 2, mask--) { + RenderLine(dst_begin, dst_end, &dst, &src, TILE_WIDTH / 2, tbl, *mask); } break; } diff --git a/defs.h b/defs.h index e7a532742..bae3e0a75 100644 --- a/defs.h +++ b/defs.h @@ -200,3 +200,15 @@ #define strcasecmp _stricmp #define strncasecmp _strnicmp #endif + +#ifdef __has_attribute +#define DVL_HAVE_ATTRIBUTE(x) __has_attribute(x) +#else +#define DVL_HAVE_ATTRIBUTE(x) 0 +#endif + +#if DVL_HAVE_ATTRIBUTE(always_inline) || (defined(__GNUC__) && !defined(__clang__)) +#define DVL_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline)) +#else +#define DVL_ATTRIBUTE_ALWAYS_INLINE +#endif