You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
182 lines
5.0 KiB
182 lines
5.0 KiB
/** |
|
* @file dc_video.cpp |
|
* @brief Dreamcast video conversion implementation |
|
* |
|
* The "Inner Loop" - this code runs 307,200 times per frame (640x480). |
|
* Every cycle counts! |
|
* |
|
* Optimization techniques used: |
|
* 1. Palette LUT fits in L1 cache (512 bytes for 256 RGB565 entries) |
|
* 2. Process 16 pixels at a time for throughput |
|
*/ |
|
|
|
#ifdef __DREAMCAST__ |
|
|
|
#include "dc_video.h" |
|
|
|
#include <kos.h> |
|
|
|
namespace devilution { |
|
namespace dc { |
|
|
|
namespace { |
|
|
|
// Primary lookup table: one RGB565 value per 8-bit palette index
// (256 entries x 2 bytes = 512 bytes). Aligned to 32 bytes for cache
// efficiency.
alignas(32) uint16_t palette565[256];

// Companion tables for the packed conversion path. Each entry holds the same
// colour pre-positioned inside a 32-bit word, so two neighbouring pixels can
// be combined with a single OR and written with one 32-bit store.
alignas(32) uint32_t palette565FirstWord[256];
alignas(32) uint32_t palette565SecondWord[256];

// Set by VideoInit() and cleared by VideoShutdown().
bool initialized { false };
|
|
|
inline void UpdatePaletteEntry(int index, uint16_t rgb565) |
|
{ |
|
palette565[index] = rgb565; |
|
#if SDL_BYTEORDER == SDL_LIL_ENDIAN |
|
palette565FirstWord[index] = rgb565; |
|
palette565SecondWord[index] = static_cast<uint32_t>(rgb565) << 16; |
|
#else |
|
palette565FirstWord[index] = static_cast<uint32_t>(rgb565) << 16; |
|
palette565SecondWord[index] = rgb565; |
|
#endif |
|
} |
|
|
|
/** |
|
* @brief Convert 16 pixels from 8bpp to 16bpp |
|
* |
|
* This is the innermost loop - fully unrolled for speed. |
|
*/ |
|
inline void Convert16PixelsScalar(const uint8_t *src, uint16_t *dst) |
|
{ |
|
dst[0] = palette565[src[0]]; |
|
dst[1] = palette565[src[1]]; |
|
dst[2] = palette565[src[2]]; |
|
dst[3] = palette565[src[3]]; |
|
dst[4] = palette565[src[4]]; |
|
dst[5] = palette565[src[5]]; |
|
dst[6] = palette565[src[6]]; |
|
dst[7] = palette565[src[7]]; |
|
dst[8] = palette565[src[8]]; |
|
dst[9] = palette565[src[9]]; |
|
dst[10] = palette565[src[10]]; |
|
dst[11] = palette565[src[11]]; |
|
dst[12] = palette565[src[12]]; |
|
dst[13] = palette565[src[13]]; |
|
dst[14] = palette565[src[14]]; |
|
dst[15] = palette565[src[15]]; |
|
} |
|
|
|
/** |
|
* @brief Convert 16 pixels using packed 32-bit writes (2 pixels per store) |
|
* |
|
* SH4 is efficient at aligned 32-bit loads/stores, so this path halves |
|
* the number of destination stores compared to scalar 16-bit writes. |
|
*/ |
|
inline void Convert16PixelsPacked(const uint8_t *src, uint16_t *dst) |
|
{ |
|
#if defined(__SH4__) || defined(__sh__) |
|
// Pull upcoming source bytes into cache early on SH4. |
|
__builtin_prefetch(src + 32, 0, 3); |
|
__builtin_prefetch(src + 64, 0, 3); |
|
#endif |
|
uint32_t *dst32 = reinterpret_cast<uint32_t *>(dst); |
|
dst32[0] = palette565FirstWord[src[0]] | palette565SecondWord[src[1]]; |
|
dst32[1] = palette565FirstWord[src[2]] | palette565SecondWord[src[3]]; |
|
dst32[2] = palette565FirstWord[src[4]] | palette565SecondWord[src[5]]; |
|
dst32[3] = palette565FirstWord[src[6]] | palette565SecondWord[src[7]]; |
|
dst32[4] = palette565FirstWord[src[8]] | palette565SecondWord[src[9]]; |
|
dst32[5] = palette565FirstWord[src[10]] | palette565SecondWord[src[11]]; |
|
dst32[6] = palette565FirstWord[src[12]] | palette565SecondWord[src[13]]; |
|
dst32[7] = palette565FirstWord[src[14]] | palette565SecondWord[src[15]]; |
|
} |
|
|
|
void ConvertFrame(const uint8_t *src, uint16_t *dst, int width, int height, int srcPitch, int dstPitch) |
|
{ |
|
for (int y = 0; y < height; y++) { |
|
const uint8_t *srcRow = src + y * srcPitch; |
|
uint16_t *dstRow = reinterpret_cast<uint16_t *>(reinterpret_cast<uint8_t *>(dst) + y * dstPitch); |
|
const bool canUsePackedPath = (reinterpret_cast<uintptr_t>(dstRow) & (alignof(uint32_t) - 1)) == 0; |
|
|
|
int x = 0; |
|
if (canUsePackedPath) { |
|
for (; x + 16 <= width; x += 16) { |
|
Convert16PixelsPacked(srcRow + x, dstRow + x); |
|
} |
|
} else { |
|
for (; x + 16 <= width; x += 16) { |
|
Convert16PixelsScalar(srcRow + x, dstRow + x); |
|
} |
|
} |
|
|
|
for (; x < width; x++) { |
|
dstRow[x] = palette565[srcRow[x]]; |
|
} |
|
} |
|
} |
|
|
|
} // anonymous namespace |
|
|
|
bool VideoInit([[maybe_unused]] int width, [[maybe_unused]] int height) |
|
{ |
|
for (int i = 0; i < 256; i++) { |
|
UpdatePaletteEntry(i, RGB888toRGB565(i, i, i)); |
|
} |
|
|
|
initialized = true; |
|
return true; |
|
} |
|
|
|
void VideoShutdown() |
|
{ |
|
initialized = false; |
|
} |
|
|
|
void UpdatePalette(const SDL_Palette *palette) |
|
{ |
|
if (!palette || !palette->colors) |
|
return; |
|
|
|
UpdatePaletteRange(palette->colors, 0, palette->ncolors); |
|
} |
|
|
|
void UpdatePaletteRange(const SDL_Color *colors, int firstColor, int nColors) |
|
{ |
|
if (!colors) |
|
return; |
|
|
|
if (firstColor + nColors > 256) |
|
nColors = 256 - firstColor; |
|
|
|
for (int i = 0; i < nColors; i++) { |
|
const SDL_Color &c = colors[i]; |
|
UpdatePaletteEntry(firstColor + i, RGB888toRGB565(c.r, c.g, c.b)); |
|
} |
|
} |
|
|
|
void ConvertAndUpload(const SDL_Surface *src, SDL_Surface *dst) |
|
{ |
|
if (!initialized || !src || !dst) |
|
return; |
|
|
|
const uint8_t *srcPixels = static_cast<const uint8_t *>(src->pixels); |
|
uint16_t *dstPixels = static_cast<uint16_t *>(dst->pixels); |
|
|
|
if (!srcPixels || !dstPixels) |
|
return; |
|
|
|
const int width = src->w < dst->w ? src->w : dst->w; |
|
const int height = src->h < dst->h ? src->h : dst->h; |
|
|
|
ConvertFrame(srcPixels, dstPixels, width, height, src->pitch, dst->pitch); |
|
} |
|
|
|
bool IsInitialized() |
|
{ |
|
return initialized; |
|
} |
|
|
|
} // namespace dc |
|
} // namespace devilution |
|
|
|
#endif // __DREAMCAST__
|
|
|