Skip to content

Instantly share code, notes, and snippets.

@Sir-Irk
Last active March 6, 2017 04:23
Show Gist options
  • Select an option

  • Save Sir-Irk/be29d736381fb8b4486b92e04673fd04 to your computer and use it in GitHub Desktop.

Select an option

Save Sir-Irk/be29d736381fb8b4486b92e04673fd04 to your computer and use it in GitHub Desktop.
#include <stdlib.h>
#include <assert.h>
#include <stdint.h>
#include <intrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#include <windows.h>
#define JM_MATH_IMPLEMENTATION
#define JM_MATH_USE_STDLIB
#undef far
#undef near
#include "jm_math.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
inline uint32_t
uint32FromV4(v4 v) {
return (uint32_t)v.r | ((uint32_t)v.g << 8) | ((uint32_t)v.b << 16) | ((uint32_t)v.a << 24);
}
LONGLONG globalPerfCountFrequency;
inline LARGE_INTEGER
Win32GetWallClock()
{
LARGE_INTEGER result;
QueryPerformanceCounter(&result);
return result;
}
inline r32
Win32GetSecondsElapsed(LARGE_INTEGER start, LARGE_INTEGER end)
{
r32 result = ((r32)(end.QuadPart - start.QuadPart) / r32(globalPerfCountFrequency));
return result;
}
#define StartTimer(name) LARGE_INTEGER name##startWallClock = Win32GetWallClock();
#define EndTimer(name) \
LARGE_INTEGER name##endWallClock = Win32GetWallClock(); \
r64 name##totalMs = Win32GetSecondsElapsed(name##startWallClock, name##endWallClock) * 1000.0f; \
fprintf(stdout, "%fms\n", name##totalMs);
internal void
basic_blur_range(uint32_t *writeBuffer, uint32_t *readBuffer, int32_t w, int32_t h, v2 *kernal,
i32 kernalCount, int32_t startX = 0, int32_t endX = -1, int32_t startY = 0, int32_t endY = -1)
{
if(endX == -1) endX = w;
if(endY == -1) endY = h;
for(int32_t y = startY; y < endY; ++y)
{
for(int32_t x = startX; x < endX; ++x)
{
int32_t baseIndex = y * w + x;
uint32_t startR = (readBuffer[baseIndex] & 0x000000FF) >> 0;
uint32_t startG = (readBuffer[baseIndex] & 0x0000FF00) >> 8;
uint32_t startB = (readBuffer[baseIndex] & 0x00FF0000) >> 16;
int32_t pixelsContributed = 0;
_declspec(align(64)) uint32_t accum[3] = {};
for(int32_t k = 0; k < kernalCount; ++k)
{
int32_t offsetX = x + kernal[k].x;
int32_t offsetY = y + kernal[k].y;
if(offsetX < 0 || offsetX >= w || offsetY < 0 || offsetY >= h) continue;
int32_t index = offsetY * w + offsetX;
accum[0] += (readBuffer[index] & 0x000000FF) >> 0;
accum[1] += (readBuffer[index] & 0x0000FF00) >> 8;
accum[2] += (readBuffer[index] & 0x00FF0000) >> 16;
++pixelsContributed;
}
switch(pixelsContributed)
{
case 0 : {} break;
case 8 :
{
accum[0] >>= 3;
accum[1] >>= 3;
accum[2] >>= 3;
} break;
case 16 :
{
accum[0] >>= 4;
accum[1] >>= 4;
accum[2] >>= 4;
} break;
default :
{
r32 inv = 1.0f / pixelsContributed;
accum[0] *= inv;
accum[1] *= inv;
accum[2] *= inv;
} break;
}
uint32_t finalR = (startR + accum[0]) >> 1;
uint32_t finalG = (startG + accum[1]) >> 1;
uint32_t finalB = (startB + accum[2]) >> 1;
writeBuffer[baseIndex] = finalR | (finalG << 8) | (finalB << 16) | (255 << 24);
}
}
}
static void
basic_blur_pow2_kernal(uint32_t *outBuffer, uint32_t *buffer0, uint32_t *buffer1, i32 w, i32 h, i32 passes, v2 *kernal, i32 kernalCount, i32 power)
{
if(passes == 0) return;
assert(buffer0 && buffer1 && outBuffer);
int32_t totalPixels = w*h;
b32 useBuffer0ForWrite = passes % 2;
uint32_t *readBuffer;
uint32_t *writeBuffer;
uint32_t *buffers[2] = {buffer0, buffer1};
for(int32_t pass = 0; pass < passes; ++pass)
{
readBuffer = buffers[!useBuffer0ForWrite];
writeBuffer = buffers[ useBuffer0ForWrite];
useBuffer0ForWrite = !useBuffer0ForWrite;
if(pass == passes - 1) writeBuffer = outBuffer;
int32_t b = 2;
#if 0
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount);
#else
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, 0, b);
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, h-b, h);
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, b, b, h-b);
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, w-b, w, b, h-b);
for(int32_t y = b; y < h-b; ++y)
{
for(int32_t x = b; x < w-b; ++x)
{
int32_t baseIndex = y * w + x;
uint32_t startR = (readBuffer[baseIndex] & 0x000000FF) >> 0;
uint32_t startG = (readBuffer[baseIndex] & 0x0000FF00) >> 8;
uint32_t startB = (readBuffer[baseIndex] & 0x00FF0000) >> 16;
_declspec(align(64)) uint32_t accum[4] = {};
for(int32_t k = 0; k < kernalCount; ++k)
{
int32_t index = (y + kernal[k].y) * w + (x + kernal[k].x);
accum[0] += ((readBuffer[index] & 0x000000FF) >> 0);
accum[1] += ((readBuffer[index] & 0x0000FF00) >> 8);
accum[2] += ((readBuffer[index] & 0x00FF0000) >> 16);
}
uint32_t finalR = (startR + (accum[0] >> power)) >> 1;
uint32_t finalG = (startG + (accum[1] >> power)) >> 1;
uint32_t finalB = (startB + (accum[2] >> power)) >> 1;
writeBuffer[baseIndex] = finalR | (finalG << 8) | (finalB << 16) | (255 << 24);
}
}
#endif
}
}
#if USE_AVX
#define SIMD_INCREMENT 8
#define simd_int __m256i
#define simd_set1_epi32(a) _mm256_set1_epi32(a)
#define simd_setzero_ix() _mm256_setzero_si256()
#define simd_setzero_rx() _mm256_setzero_ps()
#define simd_and_ix(a, b) _mm256_and_si256(a, b)
#define simd_or_ix(a, b) _mm256_or_si256(a, b)
#define simd_add_epi32(a, b) _mm256_add_epi32(a, b)
#define simd_sub_epi32(a, b) _mm256_sub_epi32(a, b)
#define simd_loadu_ix(a) _mm256_loadu_si256(a)
#define simd_storeu_ix(ptr, v) _mm256_storeu_si256(ptr, v)
#define simd_srli_epi32(a, i) _mm256_srli_epi32(a, i)
#define simd_slli_epi32(a, i) _mm256_slli_epi32(a, i)
#else
#define simd_int __m128i
#define SIMD_INCREMENT 4
#define simd_set1_epi32(a) _mm_set1_epi32(a)
#define simd_setzero_ix() _mm_setzero_si128()
#define simd_setzero_rx() _mm_setzero_ps()
#define simd_and_ix(a, b) _mm_and_si128(a, b)
#define simd_or_ix(a, b) _mm_or_si128(a, b)
#define simd_add_epi32(a, b) _mm_add_epi32(a, b)
#define simd_sub_epi32(a, b) _mm_sub_epi32(a, b)
#define simd_loadu_ix(a) _mm_loadu_si128(a)
#define simd_storeu_ix(ptr, v) _mm_storeu_si128(ptr, v)
#define simd_srli_epi32(a, i) _mm_srli_epi32(a, i)
#define simd_slli_epi32(a, i) _mm_slli_epi32(a, i)
#endif
static void
basic_blur_pow2_kernal_sse(uint32_t *outBuffer, uint32_t *buffer0, uint32_t *buffer1,
i32 w, i32 h, i32 passes, v2 *kernal, i32 kernalCount, i32 power)
{
if(passes == 0) return;
assert(buffer0 && buffer1 && outBuffer);
int32_t totalPixels = w*h;
b32 useBuffer0ForWrite = passes % 2;
uint32_t *readBuffer;
uint32_t *writeBuffer;
uint32_t *buffers[2] = {buffer0, buffer1};
simd_int redMask = simd_set1_epi32(0x000000ff);
simd_int greenMask = simd_set1_epi32(0x0000ff00);
simd_int blueMask = simd_set1_epi32(0x00ff0000);
int32_t halfKernalCount = kernalCount / 2;
i32 offsetYs[16] = {};
for(i32 i = 0; i < 16; ++i)
{
offsetYs[i] = (i32)kernal[i].y * w;
}
simd_int accumR = simd_setzero_ix();
simd_int accumG = simd_setzero_ix();
simd_int accumB = simd_setzero_ix();
int32_t border = 2;
i32 simdXMax = (w-border) - (SIMD_INCREMENT - 1);
i32 leftOverX = (w-(border*2)) % SIMD_INCREMENT;
for(int32_t pass = 0; pass < passes; ++pass)
{
readBuffer = buffers[!useBuffer0ForWrite];
writeBuffer = buffers[ useBuffer0ForWrite];
useBuffer0ForWrite = !useBuffer0ForWrite;
if(pass == passes - 1) writeBuffer = outBuffer;
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, 0, border);
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, h-border, h);
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, border, border, h-border);
basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, w-border, w, border, h-border);
for(int32_t y = border; y < h-border; ++y)
{
i32 yIndex = y * w;
for(int32_t x = border; x < simdXMax; x += SIMD_INCREMENT)
{
int32_t baseIndex = y * w + x;
simd_int colors = simd_loadu_ix((simd_int *)&readBuffer[baseIndex]);
simd_int startRs = simd_and_ix(colors, redMask);
simd_int startGs = simd_and_ix(colors, greenMask);
simd_int startBs = simd_and_ix(colors, blueMask);
accumR = accumG = accumB = simd_setzero_ix();
for(int32_t k = 0; k < kernalCount; ++k)
{
i32 yOffset = (yIndex + offsetYs[k]);
i32 index0 = yOffset + x + kernal[k].x;
simd_int colors = simd_loadu_ix((simd_int *)&readBuffer[index0]);
accumR = simd_add_epi32(accumR, simd_and_ix(colors, redMask));
accumG = simd_add_epi32(accumG, simd_and_ix(colors, greenMask));
accumB = simd_add_epi32(accumB, simd_and_ix(colors, blueMask));
}
simd_int finalRs = simd_srli_epi32(accumR, power);
simd_int finalGs = simd_srli_epi32(accumG, power);
simd_int finalBs = simd_srli_epi32(accumB, power);
finalRs = simd_srli_epi32(simd_add_epi32(startRs, finalRs), 1);
finalGs = simd_srli_epi32(simd_add_epi32(startGs, finalGs), 1);
finalBs = simd_srli_epi32(simd_add_epi32(startBs, finalBs), 1);
finalRs = simd_and_ix(finalRs, simd_set1_epi32(0x000000ff));
finalGs = simd_and_ix(finalGs, simd_set1_epi32(0x0000ff00));
finalBs = simd_and_ix(finalBs, simd_set1_epi32(0x00ff0000));
simd_int finalColors = simd_or_ix(simd_or_ix(simd_or_ix(finalRs, finalGs), finalBs), simd_set1_epi32(255 << 24));
simd_storeu_ix((simd_int *)&writeBuffer[baseIndex], finalColors);
}
}
//NOTE: blur from the remaining columns of pixels up to the already blurred border.
if(leftOverX) basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, simdXMax, w-border, border, h-border);
}
}
static void
blend_uniform_images(uint32_t *outPixels, uint32_t *inPixels0, uint32_t *inPixels1, int32_t numPixels)
{
for(int32_t i = 0; i < numPixels; ++i)
{
uint32_t p0 = ((uint32_t *)inPixels0)[i];
uint32_t p1 = ((uint32_t *)inPixels1)[i];
uint32_t r0 = (p0 & 0x000000FF) >> 0;
uint32_t g0 = (p0 & 0x0000FF00) >> 8;
uint32_t b0 = (p0 & 0x00FF0000) >> 16;
uint32_t r1 = (p1 & 0x000000FF) >> 0;
uint32_t g1 = (p1 & 0x0000FF00) >> 8;
uint32_t b1 = (p1 & 0x00FF0000) >> 16;
uint32_t red = (r0 + r1) >> 1;
uint32_t green = (g0 + g1) >> 1;
uint32_t blue = (b0 + b1) >> 1;
outPixels[i] = red | (green << 8) | (blue << 16) | (255 << 24);
}
}
int main(void)
{
LARGE_INTEGER perfCountFrequencyResult;
QueryPerformanceFrequency(&perfCountFrequencyResult);
globalPerfCountFrequency = perfCountFrequencyResult.QuadPart;
int32_t w0, w1, h0, h1;
uint8_t *loadedPixels0 = stbi_load("galax7.png", &w0, &h0, 0, 4);
uint8_t *loadedPixels1 = stbi_load("galax7.png", &w1, &h1, 0, 4);
uint32_t *outPixels = (uint32_t *)calloc(w0*h0, sizeof(uint32_t));
uint32_t *blurBuffer = (uint32_t *)calloc(w0*h0, sizeof(uint32_t));
assert(loadedPixels0 && loadedPixels1 && outPixels && blurBuffer);
//blend_uniform_images(outPixels, (uint32_t *)loadedPixels0, (uint32_t *)loadedPixels1, w0*h0);
memcpy(outPixels, loadedPixels0, sizeof(uint32_t) * w0*h0);
memcpy(blurBuffer, outPixels, sizeof(uint32_t) * w0*h0);
uint64_t startCycleCounter = __rdtsc();
StartTimer(Fuck);
v2 blurKernalSmall[8] = {
{-1, -1}, {0, -1}, {1, -1},
{-1, -0}, {1, -0},
{-1, 1}, {0, 1}, {1, 1},
};
v2 blurKernal16[16] = {
{-1, -2}, {0, -2}, {1, -2},
{-1, -1}, {0, -1}, {1, -1},
{-2, 0}, {-1, -0}, {1, -0}, {2, 0},
{-1, 1}, {0, 1}, {1, 1},
{-1, 2}, {0, 2}, {1, 2}
};
//basic_blur_pow2_kernal(outPixels, outPixels, blurBuffer, w0, h0, 32, blurKernal16, sizeof(blurKernal16)/sizeof(blurKernal16[0]), 4);
basic_blur_pow2_kernal_sse(outPixels, outPixels, blurBuffer, w0, h0, 32, blurKernal16, sizeof(blurKernal16)/sizeof(blurKernal16[0]), 4);
uint64_t endCycleCounter = __rdtsc();
int64_t cycleCount = endCycleCounter - startCycleCounter;
printf("\n %f MC | ", (double)cycleCount * 0.000001f);
EndTimer(Fuck);
stbi_write_png("nice03.png", w1, h1, 4, outPixels, 0);
free(loadedPixels0);
free(loadedPixels1);
free(outPixels);
free(blurBuffer);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment