Last active
March 6, 2017 04:23
-
-
Save Sir-Irk/be29d736381fb8b4486b92e04673fd04 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdlib.h> | |
| #include <assert.h> | |
| #include <stdint.h> | |
| #include <intrin.h> | |
| #include <emmintrin.h> | |
| #include <immintrin.h> | |
| #include <windows.h> | |
| #define JM_MATH_IMPLEMENTATION | |
| #define JM_MATH_USE_STDLIB | |
| #undef far | |
| #undef near | |
| #include "jm_math.h" | |
| #define STB_IMAGE_IMPLEMENTATION | |
| #define STB_IMAGE_WRITE_IMPLEMENTATION | |
| #include "stb_image.h" | |
| #include "stb_image_write.h" | |
| inline uint32_t | |
| uint32FromV4(v4 v) { | |
| return (uint32_t)v.r | ((uint32_t)v.g << 8) | ((uint32_t)v.b << 16) | ((uint32_t)v.a << 24); | |
| } | |
| LONGLONG globalPerfCountFrequency; | |
| inline LARGE_INTEGER | |
| Win32GetWallClock() | |
| { | |
| LARGE_INTEGER result; | |
| QueryPerformanceCounter(&result); | |
| return result; | |
| } | |
| inline r32 | |
| Win32GetSecondsElapsed(LARGE_INTEGER start, LARGE_INTEGER end) | |
| { | |
| r32 result = ((r32)(end.QuadPart - start.QuadPart) / r32(globalPerfCountFrequency)); | |
| return result; | |
| } | |
| #define StartTimer(name) LARGE_INTEGER name##startWallClock = Win32GetWallClock(); | |
| #define EndTimer(name) \ | |
| LARGE_INTEGER name##endWallClock = Win32GetWallClock(); \ | |
| r64 name##totalMs = Win32GetSecondsElapsed(name##startWallClock, name##endWallClock) * 1000.0f; \ | |
| fprintf(stdout, "%fms\n", name##totalMs); | |
| internal void | |
| basic_blur_range(uint32_t *writeBuffer, uint32_t *readBuffer, int32_t w, int32_t h, v2 *kernal, | |
| i32 kernalCount, int32_t startX = 0, int32_t endX = -1, int32_t startY = 0, int32_t endY = -1) | |
| { | |
| if(endX == -1) endX = w; | |
| if(endY == -1) endY = h; | |
| for(int32_t y = startY; y < endY; ++y) | |
| { | |
| for(int32_t x = startX; x < endX; ++x) | |
| { | |
| int32_t baseIndex = y * w + x; | |
| uint32_t startR = (readBuffer[baseIndex] & 0x000000FF) >> 0; | |
| uint32_t startG = (readBuffer[baseIndex] & 0x0000FF00) >> 8; | |
| uint32_t startB = (readBuffer[baseIndex] & 0x00FF0000) >> 16; | |
| int32_t pixelsContributed = 0; | |
| _declspec(align(64)) uint32_t accum[3] = {}; | |
| for(int32_t k = 0; k < kernalCount; ++k) | |
| { | |
| int32_t offsetX = x + kernal[k].x; | |
| int32_t offsetY = y + kernal[k].y; | |
| if(offsetX < 0 || offsetX >= w || offsetY < 0 || offsetY >= h) continue; | |
| int32_t index = offsetY * w + offsetX; | |
| accum[0] += (readBuffer[index] & 0x000000FF) >> 0; | |
| accum[1] += (readBuffer[index] & 0x0000FF00) >> 8; | |
| accum[2] += (readBuffer[index] & 0x00FF0000) >> 16; | |
| ++pixelsContributed; | |
| } | |
| switch(pixelsContributed) | |
| { | |
| case 0 : {} break; | |
| case 8 : | |
| { | |
| accum[0] >>= 3; | |
| accum[1] >>= 3; | |
| accum[2] >>= 3; | |
| } break; | |
| case 16 : | |
| { | |
| accum[0] >>= 4; | |
| accum[1] >>= 4; | |
| accum[2] >>= 4; | |
| } break; | |
| default : | |
| { | |
| r32 inv = 1.0f / pixelsContributed; | |
| accum[0] *= inv; | |
| accum[1] *= inv; | |
| accum[2] *= inv; | |
| } break; | |
| } | |
| uint32_t finalR = (startR + accum[0]) >> 1; | |
| uint32_t finalG = (startG + accum[1]) >> 1; | |
| uint32_t finalB = (startB + accum[2]) >> 1; | |
| writeBuffer[baseIndex] = finalR | (finalG << 8) | (finalB << 16) | (255 << 24); | |
| } | |
| } | |
| } | |
| static void | |
| basic_blur_pow2_kernal(uint32_t *outBuffer, uint32_t *buffer0, uint32_t *buffer1, i32 w, i32 h, i32 passes, v2 *kernal, i32 kernalCount, i32 power) | |
| { | |
| if(passes == 0) return; | |
| assert(buffer0 && buffer1 && outBuffer); | |
| int32_t totalPixels = w*h; | |
| b32 useBuffer0ForWrite = passes % 2; | |
| uint32_t *readBuffer; | |
| uint32_t *writeBuffer; | |
| uint32_t *buffers[2] = {buffer0, buffer1}; | |
| for(int32_t pass = 0; pass < passes; ++pass) | |
| { | |
| readBuffer = buffers[!useBuffer0ForWrite]; | |
| writeBuffer = buffers[ useBuffer0ForWrite]; | |
| useBuffer0ForWrite = !useBuffer0ForWrite; | |
| if(pass == passes - 1) writeBuffer = outBuffer; | |
| int32_t b = 2; | |
| #if 0 | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount); | |
| #else | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, 0, b); | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, h-b, h); | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, b, b, h-b); | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, w-b, w, b, h-b); | |
| for(int32_t y = b; y < h-b; ++y) | |
| { | |
| for(int32_t x = b; x < w-b; ++x) | |
| { | |
| int32_t baseIndex = y * w + x; | |
| uint32_t startR = (readBuffer[baseIndex] & 0x000000FF) >> 0; | |
| uint32_t startG = (readBuffer[baseIndex] & 0x0000FF00) >> 8; | |
| uint32_t startB = (readBuffer[baseIndex] & 0x00FF0000) >> 16; | |
| _declspec(align(64)) uint32_t accum[4] = {}; | |
| for(int32_t k = 0; k < kernalCount; ++k) | |
| { | |
| int32_t index = (y + kernal[k].y) * w + (x + kernal[k].x); | |
| accum[0] += ((readBuffer[index] & 0x000000FF) >> 0); | |
| accum[1] += ((readBuffer[index] & 0x0000FF00) >> 8); | |
| accum[2] += ((readBuffer[index] & 0x00FF0000) >> 16); | |
| } | |
| uint32_t finalR = (startR + (accum[0] >> power)) >> 1; | |
| uint32_t finalG = (startG + (accum[1] >> power)) >> 1; | |
| uint32_t finalB = (startB + (accum[2] >> power)) >> 1; | |
| writeBuffer[baseIndex] = finalR | (finalG << 8) | (finalB << 16) | (255 << 24); | |
| } | |
| } | |
| #endif | |
| } | |
| } | |
| #if USE_AVX | |
| #define SIMD_INCREMENT 8 | |
| #define simd_int __m256i | |
| #define simd_set1_epi32(a) _mm256_set1_epi32(a) | |
| #define simd_setzero_ix() _mm256_setzero_si256() | |
| #define simd_setzero_rx() _mm256_setzero_ps() | |
| #define simd_and_ix(a, b) _mm256_and_si256(a, b) | |
| #define simd_or_ix(a, b) _mm256_or_si256(a, b) | |
| #define simd_add_epi32(a, b) _mm256_add_epi32(a, b) | |
| #define simd_sub_epi32(a, b) _mm256_sub_epi32(a, b) | |
| #define simd_loadu_ix(a) _mm256_loadu_si256(a) | |
| #define simd_storeu_ix(ptr, v) _mm256_storeu_si256(ptr, v) | |
| #define simd_srli_epi32(a, i) _mm256_srli_epi32(a, i) | |
| #define simd_slli_epi32(a, i) _mm256_slli_epi32(a, i) | |
| #else | |
| #define simd_int __m128i | |
| #define SIMD_INCREMENT 4 | |
| #define simd_set1_epi32(a) _mm_set1_epi32(a) | |
| #define simd_setzero_ix() _mm_setzero_si128() | |
| #define simd_setzero_rx() _mm_setzero_ps() | |
| #define simd_and_ix(a, b) _mm_and_si128(a, b) | |
| #define simd_or_ix(a, b) _mm_or_si128(a, b) | |
| #define simd_add_epi32(a, b) _mm_add_epi32(a, b) | |
| #define simd_sub_epi32(a, b) _mm_sub_epi32(a, b) | |
| #define simd_loadu_ix(a) _mm_loadu_si128(a) | |
| #define simd_storeu_ix(ptr, v) _mm_storeu_si128(ptr, v) | |
| #define simd_srli_epi32(a, i) _mm_srli_epi32(a, i) | |
| #define simd_slli_epi32(a, i) _mm_slli_epi32(a, i) | |
| #endif | |
| static void | |
| basic_blur_pow2_kernal_sse(uint32_t *outBuffer, uint32_t *buffer0, uint32_t *buffer1, | |
| i32 w, i32 h, i32 passes, v2 *kernal, i32 kernalCount, i32 power) | |
| { | |
| if(passes == 0) return; | |
| assert(buffer0 && buffer1 && outBuffer); | |
| int32_t totalPixels = w*h; | |
| b32 useBuffer0ForWrite = passes % 2; | |
| uint32_t *readBuffer; | |
| uint32_t *writeBuffer; | |
| uint32_t *buffers[2] = {buffer0, buffer1}; | |
| simd_int redMask = simd_set1_epi32(0x000000ff); | |
| simd_int greenMask = simd_set1_epi32(0x0000ff00); | |
| simd_int blueMask = simd_set1_epi32(0x00ff0000); | |
| int32_t halfKernalCount = kernalCount / 2; | |
| i32 offsetYs[16] = {}; | |
| for(i32 i = 0; i < 16; ++i) | |
| { | |
| offsetYs[i] = (i32)kernal[i].y * w; | |
| } | |
| simd_int accumR = simd_setzero_ix(); | |
| simd_int accumG = simd_setzero_ix(); | |
| simd_int accumB = simd_setzero_ix(); | |
| int32_t border = 2; | |
| i32 simdXMax = (w-border) - (SIMD_INCREMENT - 1); | |
| i32 leftOverX = (w-(border*2)) % SIMD_INCREMENT; | |
| for(int32_t pass = 0; pass < passes; ++pass) | |
| { | |
| readBuffer = buffers[!useBuffer0ForWrite]; | |
| writeBuffer = buffers[ useBuffer0ForWrite]; | |
| useBuffer0ForWrite = !useBuffer0ForWrite; | |
| if(pass == passes - 1) writeBuffer = outBuffer; | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, 0, border); | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, w, h-border, h); | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, 0, border, border, h-border); | |
| basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, w-border, w, border, h-border); | |
| for(int32_t y = border; y < h-border; ++y) | |
| { | |
| i32 yIndex = y * w; | |
| for(int32_t x = border; x < simdXMax; x += SIMD_INCREMENT) | |
| { | |
| int32_t baseIndex = y * w + x; | |
| simd_int colors = simd_loadu_ix((simd_int *)&readBuffer[baseIndex]); | |
| simd_int startRs = simd_and_ix(colors, redMask); | |
| simd_int startGs = simd_and_ix(colors, greenMask); | |
| simd_int startBs = simd_and_ix(colors, blueMask); | |
| accumR = accumG = accumB = simd_setzero_ix(); | |
| for(int32_t k = 0; k < kernalCount; ++k) | |
| { | |
| i32 yOffset = (yIndex + offsetYs[k]); | |
| i32 index0 = yOffset + x + kernal[k].x; | |
| simd_int colors = simd_loadu_ix((simd_int *)&readBuffer[index0]); | |
| accumR = simd_add_epi32(accumR, simd_and_ix(colors, redMask)); | |
| accumG = simd_add_epi32(accumG, simd_and_ix(colors, greenMask)); | |
| accumB = simd_add_epi32(accumB, simd_and_ix(colors, blueMask)); | |
| } | |
| simd_int finalRs = simd_srli_epi32(accumR, power); | |
| simd_int finalGs = simd_srli_epi32(accumG, power); | |
| simd_int finalBs = simd_srli_epi32(accumB, power); | |
| finalRs = simd_srli_epi32(simd_add_epi32(startRs, finalRs), 1); | |
| finalGs = simd_srli_epi32(simd_add_epi32(startGs, finalGs), 1); | |
| finalBs = simd_srli_epi32(simd_add_epi32(startBs, finalBs), 1); | |
| finalRs = simd_and_ix(finalRs, simd_set1_epi32(0x000000ff)); | |
| finalGs = simd_and_ix(finalGs, simd_set1_epi32(0x0000ff00)); | |
| finalBs = simd_and_ix(finalBs, simd_set1_epi32(0x00ff0000)); | |
| simd_int finalColors = simd_or_ix(simd_or_ix(simd_or_ix(finalRs, finalGs), finalBs), simd_set1_epi32(255 << 24)); | |
| simd_storeu_ix((simd_int *)&writeBuffer[baseIndex], finalColors); | |
| } | |
| } | |
| //NOTE: blur from the remaining columns of pixels up to the already blurred border. | |
| if(leftOverX) basic_blur_range(writeBuffer, readBuffer, w, h, kernal, kernalCount, simdXMax, w-border, border, h-border); | |
| } | |
| } | |
| static void | |
| blend_uniform_images(uint32_t *outPixels, uint32_t *inPixels0, uint32_t *inPixels1, int32_t numPixels) | |
| { | |
| for(int32_t i = 0; i < numPixels; ++i) | |
| { | |
| uint32_t p0 = ((uint32_t *)inPixels0)[i]; | |
| uint32_t p1 = ((uint32_t *)inPixels1)[i]; | |
| uint32_t r0 = (p0 & 0x000000FF) >> 0; | |
| uint32_t g0 = (p0 & 0x0000FF00) >> 8; | |
| uint32_t b0 = (p0 & 0x00FF0000) >> 16; | |
| uint32_t r1 = (p1 & 0x000000FF) >> 0; | |
| uint32_t g1 = (p1 & 0x0000FF00) >> 8; | |
| uint32_t b1 = (p1 & 0x00FF0000) >> 16; | |
| uint32_t red = (r0 + r1) >> 1; | |
| uint32_t green = (g0 + g1) >> 1; | |
| uint32_t blue = (b0 + b1) >> 1; | |
| outPixels[i] = red | (green << 8) | (blue << 16) | (255 << 24); | |
| } | |
| } | |
| int main(void) | |
| { | |
| LARGE_INTEGER perfCountFrequencyResult; | |
| QueryPerformanceFrequency(&perfCountFrequencyResult); | |
| globalPerfCountFrequency = perfCountFrequencyResult.QuadPart; | |
| int32_t w0, w1, h0, h1; | |
| uint8_t *loadedPixels0 = stbi_load("galax7.png", &w0, &h0, 0, 4); | |
| uint8_t *loadedPixels1 = stbi_load("galax7.png", &w1, &h1, 0, 4); | |
| uint32_t *outPixels = (uint32_t *)calloc(w0*h0, sizeof(uint32_t)); | |
| uint32_t *blurBuffer = (uint32_t *)calloc(w0*h0, sizeof(uint32_t)); | |
| assert(loadedPixels0 && loadedPixels1 && outPixels && blurBuffer); | |
| //blend_uniform_images(outPixels, (uint32_t *)loadedPixels0, (uint32_t *)loadedPixels1, w0*h0); | |
| memcpy(outPixels, loadedPixels0, sizeof(uint32_t) * w0*h0); | |
| memcpy(blurBuffer, outPixels, sizeof(uint32_t) * w0*h0); | |
| uint64_t startCycleCounter = __rdtsc(); | |
| StartTimer(Fuck); | |
| v2 blurKernalSmall[8] = { | |
| {-1, -1}, {0, -1}, {1, -1}, | |
| {-1, -0}, {1, -0}, | |
| {-1, 1}, {0, 1}, {1, 1}, | |
| }; | |
| v2 blurKernal16[16] = { | |
| {-1, -2}, {0, -2}, {1, -2}, | |
| {-1, -1}, {0, -1}, {1, -1}, | |
| {-2, 0}, {-1, -0}, {1, -0}, {2, 0}, | |
| {-1, 1}, {0, 1}, {1, 1}, | |
| {-1, 2}, {0, 2}, {1, 2} | |
| }; | |
| //basic_blur_pow2_kernal(outPixels, outPixels, blurBuffer, w0, h0, 32, blurKernal16, sizeof(blurKernal16)/sizeof(blurKernal16[0]), 4); | |
| basic_blur_pow2_kernal_sse(outPixels, outPixels, blurBuffer, w0, h0, 32, blurKernal16, sizeof(blurKernal16)/sizeof(blurKernal16[0]), 4); | |
| uint64_t endCycleCounter = __rdtsc(); | |
| int64_t cycleCount = endCycleCounter - startCycleCounter; | |
| printf("\n %f MC | ", (double)cycleCount * 0.000001f); | |
| EndTimer(Fuck); | |
| stbi_write_png("nice03.png", w1, h1, 4, outPixels, 0); | |
| free(loadedPixels0); | |
| free(loadedPixels1); | |
| free(outPixels); | |
| free(blurBuffer); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment