Last active
December 28, 2025 13:49
-
-
Save chikuzen/84366106f0a84a981d50355315a45c47 to your computer and use it in GitHub Desktop.
除算と逆数の乗算の速度比較 (Speed comparison: division vs. multiplication by the reciprocal)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <format>
#include <type_traits>

#include <immintrin.h>
| using namespace std::chrono; | |
// __forceinline is a keyword of MSVC-compatible compilers, so the decision to
// emulate it is keyed on the compiler (_MSC_VER) rather than on the target OS.
// Elsewhere it is mapped to the GNU always_inline attribute, unless something
// included earlier has already defined a macro with that name.
#if !defined(_MSC_VER) && !defined(__forceinline)
#define __forceinline inline __attribute__((always_inline))
#endif
// Computes central-difference gradients of a row-major float image.
//
//   gx[x] = src[x - 1] - src[x + 1]          (horizontal difference)
//   gy[x] = src[x - stride] - src[x + stride] (vertical difference)
//
// srcp   : source image, `height` rows of `stride` floats.
// gx, gy : destination planes with the same layout as srcp.
// stride : number of floats per row (also used as the processed width).
// height : number of rows.
//
// The outermost one-pixel border (first/last row, first/last column) is
// written as 0 because it has no neighbour on one side. An empty image
// (stride == 0 or height == 0) is left untouched, and a single-row image only
// has its one row cleared.
static void edge_detection(float* srcp, float* gx, float* gy, size_t stride, size_t height)
{
    if (stride == 0 || height == 0) {
        return;
    }
    const size_t row_bytes = stride * sizeof(float);

    // First row: no row above it.
    std::memset(gx, 0, row_bytes);
    std::memset(gy, 0, row_bytes);
    if (height == 1) {
        return; // there is no separate last row to clear
    }

    // `y + 1 < height` / `x + 1 < stride` avoid unsigned underflow of `n - 1`.
    for (size_t y = 1; y + 1 < height; ++y) {
        gx += stride;
        gy += stride;
        srcp += stride;

        // Explicit neighbour-row pointers instead of indexing with the
        // wrapped-around unsigned value `x - stride`.
        const float* above = srcp - stride;
        const float* below = srcp + stride;

        gx[0] = 0;
        gy[0] = 0;
        for (size_t x = 1; x + 1 < stride; ++x) {
            gx[x] = srcp[x - 1] - srcp[x + 1];
            gy[x] = above[x] - below[x];
        }
        gx[stride - 1] = 0;
        gy[stride - 1] = 0;
    }

    // gx/gy now point at row (height - 2); the next row is the last one.
    std::memset(gx + stride, 0, row_bytes);
    std::memset(gy + stride, 0, row_bytes);
}
// Loads one SIMD vector of packed floats from `ptr` using the aligned-load
// intrinsic that matches T (__m128 always; __m256 / __m512 only when the
// build defines __AVX2__ / __AVX512F__).
//
// ptr must satisfy the alignment requirement of the aligned load for T.
// Instantiating with any other T is rejected at compile time instead of
// silently falling off the end of a value-returning function.
template <typename T>
static __forceinline T load(const float* ptr)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_load_ps(ptr);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_load_ps(ptr);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_load_ps(ptr);
    }
#endif
#endif
    else {
        // Dependent condition: only fires when this branch is instantiated.
        static_assert(sizeof(T) == 0, "load<T>: vector type not supported by this build");
    }
}
// Lane-wise product a * b of two packed-float vectors of type T
// (__m128 always; __m256 / __m512 only when the build defines
// __AVX2__ / __AVX512F__).
//
// Any other T is rejected at compile time rather than leaving the function
// without a return statement.
template <typename T>
static __forceinline T mul(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_mul_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_mul_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_mul_ps(a, b);
    }
#endif
#endif
    else {
        // Dependent condition: only fires when this branch is instantiated.
        static_assert(sizeof(T) == 0, "mul<T>: vector type not supported by this build");
    }
}
// Lane-wise quotient a / b of two packed-float vectors of type T
// (__m128 always; __m256 / __m512 only when the build defines
// __AVX2__ / __AVX512F__).
//
// Any other T is rejected at compile time rather than leaving the function
// without a return statement.
template <typename T>
static __forceinline T div(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_div_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_div_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_div_ps(a, b);
    }
#endif
#endif
    else {
        // Dependent condition: only fires when this branch is instantiated.
        static_assert(sizeof(T) == 0, "div<T>: vector type not supported by this build");
    }
}
// Lane-wise sum a + b of two packed-float vectors of type T
// (__m128 always; __m256 / __m512 only when the build defines
// __AVX2__ / __AVX512F__).
//
// Any other T is rejected at compile time rather than leaving the function
// without a return statement.
template <typename T>
static __forceinline T add(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_add_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_add_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_add_ps(a, b);
    }
#endif
#endif
    else {
        // Dependent condition: only fires when this branch is instantiated.
        static_assert(sizeof(T) == 0, "add<T>: vector type not supported by this build");
    }
}
// Lane-wise difference a - b of two packed-float vectors of type T
// (__m128 always; __m256 / __m512 only when the build defines
// __AVX2__ / __AVX512F__).
//
// Any other T is rejected at compile time rather than leaving the function
// without a return statement.
template <typename T>
static __forceinline T sub(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_sub_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_sub_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_sub_ps(a, b);
    }
#endif
#endif
    else {
        // Dependent condition: only fires when this branch is instantiated.
        static_assert(sizeof(T) == 0, "sub<T>: vector type not supported by this build");
    }
}
// Lane-wise reciprocal estimate of `a`, using the hardware reciprocal
// intrinsic for T (_mm_rcp_ps, _mm256_rcp_ps, or _mm512_rcp14_ps). These
// intrinsics return an approximation of 1/a, not a correctly rounded
// quotient, and the 512-bit variant is a different instruction from the
// 128/256-bit ones, so results may differ slightly between vector widths.
//
// __m256 / __m512 are only available when the build defines
// __AVX2__ / __AVX512F__; any other T is rejected at compile time rather
// than leaving the function without a return statement.
//
// NOTE(review): the leading underscore makes this a reserved name at global
// scope; it is kept because other functions in this file call it.
template <typename T>
static __forceinline T _rcp(const T& a)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_rcp_ps(a);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_rcp_ps(a);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_rcp14_ps(a);
    }
#endif
#endif
    else {
        // Dependent condition: only fires when this branch is instantiated.
        static_assert(sizeof(T) == 0, "_rcp<T>: vector type not supported by this build");
    }
}
// Stores the packed-float vector `v` to `ptr` using the aligned-store
// intrinsic that matches T (__m128 always; __m256 / __m512 only when the
// build defines __AVX2__ / __AVX512F__).
//
// ptr must satisfy the alignment requirement of the aligned store for T.
// Any other T is rejected at compile time; previously such a call compiled
// to a function that stored nothing.
template <typename T>
static __forceinline void store(float* ptr, const T& v)
{
    if constexpr (std::is_same_v<T, __m128>) {
        _mm_store_ps(ptr, v);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        _mm256_store_ps(ptr, v);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        _mm512_store_ps(ptr, v);
    }
#endif
#endif
    else {
        // Dependent condition: only fires when this branch is instantiated.
        static_assert(sizeof(T) == 0, "store<T>: vector type not supported by this build");
    }
}
| template <typename T> | |
| int64_t calc_tan0(const float* gx, const float* gy, float* dstp, size_t stride, size_t height) | |
| { | |
| auto start = system_clock::now(); | |
| for (size_t y = 1; y < height - 1; ++y) { | |
| gx += stride; | |
| gy += stride; | |
| dstp += stride; | |
| for (size_t x = 0; x < stride; x += sizeof(T) / sizeof(float)) { | |
| T vx = load<T>(gx + x); | |
| T vy = load<T>(gy + x); | |
| store<T>(dstp + x, div<T>(vx, vy)); | |
| } | |
| } | |
| auto end = system_clock::now(); | |
| return duration_cast<nanoseconds>(end - start).count(); | |
| } | |
| template <typename T> | |
| int64_t calc_tan1(const float* gx, const float* gy, float* dstp, size_t stride, size_t height) | |
| { | |
| auto start = system_clock::now(); | |
| for (size_t y = 1; y < height - 1; ++y) { | |
| gx += stride; | |
| gy += stride; | |
| dstp += stride; | |
| for (size_t x = 0; x < stride; x += sizeof(T) / sizeof(float)) { | |
| T vx = load<T>(gx + x); | |
| T vy = load<T>(gy + x); | |
| T rcp = _rcp<T>(vx); | |
| store<T>(dstp + x, mul<T>(rcp, vy)); | |
| } | |
| } | |
| auto end = system_clock::now(); | |
| return duration_cast<nanoseconds>(end - start).count(); | |
| } | |
| template <typename T> | |
| static __forceinline T rcphq(const T& a) | |
| { | |
| if constexpr (std::is_same_v<T, __m128>) { | |
| __m128 rcp = _rcp(a); | |
| return sub(add(rcp, rcp), mul(a, mul(rcp, rcp))); | |
| } | |
| #ifdef __AVX2__ | |
| else if constexpr (std::is_same_v<T, __m256>) { | |
| static const __m256 two = _mm256_set1_ps(2.0f); | |
| __m256 rcp = _rcp(a); | |
| return mul(rcp, _mm256_fnmadd_ps(a, rcp, two)); | |
| } | |
| #ifdef __AVX512F__ | |
| else if constexpr (std::is_same_v<T, __m512>) { | |
| static const __m512 two = _mm512_set1_ps(2.0f); | |
| __m512 rcp = _rcp(a); | |
| return mul(rcp, _mm512_fnmadd_ps(a, rcp, two)); | |
| } | |
| #endif | |
| #endif | |
| } | |
| template <typename T> | |
| int64_t calc_tan2(const float* gx, const float* gy, float* dstp, size_t stride, size_t height) | |
| { | |
| auto start = system_clock::now(); | |
| for (size_t y = 1; y < height - 1; ++y) { | |
| gx += stride; | |
| gy += stride; | |
| dstp += stride; | |
| for (size_t x = 0; x < stride; x += sizeof(T) / sizeof(float)) { | |
| T vx = load<T>(gx + x); | |
| T vy = load<T>(gy + x); | |
| T rcp = rcphq(vx); | |
| store<T>(dstp + x, mul<T>(rcp, vy)); | |
| } | |
| } | |
| auto end = system_clock::now(); | |
| return duration_cast<nanoseconds>(end - start).count(); | |
| } | |
// Runs the three calc_tan variants for vector type T 10000 times each,
// interleaved, and prints the accumulated nanoseconds per variant as
// "<label> t0: N", "<label> t1: N", "<label> t2: N" followed by a blank line.
template <typename T>
static void run_benchmark(const char* label, const float* gx, const float* gy,
                          float* tan0, float* tan1, float* tan2,
                          size_t stride, size_t height)
{
    constexpr int iterations = 10000;
    int64_t t0 = 0;
    int64_t t1 = 0;
    int64_t t2 = 0;
    for (int i = 0; i < iterations; ++i) {
        t0 += calc_tan0<T>(gx, gy, tan0, stride, height);
        t1 += calc_tan1<T>(gx, gy, tan1, stride, height);
        t2 += calc_tan2<T>(gx, gy, tan2, stride, height);
    }
    puts(std::format("{} t0: {}", label, t0).c_str());
    puts(std::format("{} t1: {}", label, t1).c_str());
    puts(std::format("{} t2: {}", label, t2).c_str());
    puts("");
}

// Loads a 1920x1080 raw float frame from ./cr_float_1920x1080.raw, derives
// its gradient planes with edge_detection, and benchmarks division against
// reciprocal-multiply for every vector width enabled in this build.
//
// Returns 0 on success and 1 if the buffers cannot be allocated or the input
// file cannot be opened or is shorter than one full frame.
int main(void)
{
    constexpr size_t stride = 1920;
    constexpr size_t height = 1080;
    constexpr size_t framesize = stride * height;

    // One 64-byte-aligned allocation holding six consecutive float planes:
    // source, gx, gy and one destination per benchmark variant.
    void* buff = _mm_malloc(framesize * sizeof(float) * 6, 64);
    if (!buff) {
        fputs("failed to allocate frame buffers\n", stderr);
        return 1;
    }
    float* srcp = static_cast<float*>(buff);
    float* gx = srcp + framesize;
    float* gy = gx + framesize;
    float* tan0 = gy + framesize;
    float* tan1 = tan0 + framesize;
    float* tan2 = tan1 + framesize;

    FILE* fp = fopen("./cr_float_1920x1080.raw", "rb");
    if (!fp) {
        fputs("failed to open ./cr_float_1920x1080.raw\n", stderr);
        _mm_free(buff);
        return 1;
    }
    const size_t items_read = fread(srcp, sizeof(float), framesize, fp);
    fclose(fp);
    if (items_read != framesize) {
        // A short read would leave part of the source plane uninitialised.
        fputs("input file is shorter than one 1920x1080 float frame\n", stderr);
        _mm_free(buff);
        return 1;
    }

    edge_detection(srcp, gx, gy, stride, height);

    run_benchmark<__m128>("__m128", gx, gy, tan0, tan1, tan2, stride, height);
#ifdef __AVX2__
    run_benchmark<__m256>("__m256", gx, gy, tan0, tan1, tan2, stride, height);
#endif
#ifdef __AVX512F__
    run_benchmark<__m512>("__m512", gx, gy, tan0, tan1, tan2, stride, height);
#endif

    _mm_free(buff);
    return 0;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Ran the benchmark 10 times on an Intel Core i7-4770 | |
| __m128 t0: 16141603900 | |
| __m128 t1: 16006334100 | |
| __m128 t2: 16458018300 | |
| __m256 t0: 16232862700 | |
| __m256 t1: 16341678800 | |
| __m256 t2: 17506229300 | |
| __m128 t0: 16128331900 | |
| __m128 t1: 16012951800 | |
| __m128 t2: 16281443300 | |
| __m256 t0: 16234015800 | |
| __m256 t1: 16360998000 | |
| __m256 t2: 17222480900 | |
| __m128 t0: 16320674000 | |
| __m128 t1: 16245989400 | |
| __m128 t2: 16469393100 | |
| __m256 t0: 16417865900 | |
| __m256 t1: 16526099500 | |
| __m256 t2: 17385912800 | |
| __m128 t0: 16176071800 | |
| __m128 t1: 16158048900 | |
| __m128 t2: 16389021200 | |
| __m256 t0: 16253484600 | |
| __m256 t1: 16464722100 | |
| __m256 t2: 17305499100 | |
| __m128 t0: 16152353600 | |
| __m128 t1: 16014629700 | |
| __m128 t2: 16377454700 | |
| __m256 t0: 16249665800 | |
| __m256 t1: 16344594300 | |
| __m256 t2: 17280650800 | |
| __m128 t0: 16141269300 | |
| __m128 t1: 16135545600 | |
| __m128 t2: 16494164700 | |
| __m256 t0: 16131817300 | |
| __m256 t1: 16393160300 | |
| __m256 t2: 17194029200 | |
| __m128 t0: 16089997000 | |
| __m128 t1: 16033717000 | |
| __m128 t2: 16364618000 | |
| __m256 t0: 16179000900 | |
| __m256 t1: 16365154100 | |
| __m256 t2: 17253009000 | |
| __m128 t0: 16179979000 | |
| __m128 t1: 16052390100 | |
| __m128 t2: 16445640500 | |
| __m256 t0: 16222803600 | |
| __m256 t1: 16326864500 | |
| __m256 t2: 17286472200 | |
| __m128 t0: 16177169600 | |
| __m128 t1: 15996007000 | |
| __m128 t2: 16322758500 | |
| __m256 t0: 16278922500 | |
| __m256 t1: 16345059400 | |
| __m256 t2: 17250139300 | |
| __m128 t0: 16157409800 | |
| __m128 t1: 16038530800 | |
| __m128 t2: 16390106900 | |
| __m256 t0: 16559159400 | |
| __m256 t1: 16714210000 | |
| __m256 t2: 17722736200 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Ran the benchmark 10 times on an Intel Core i7-7700HQ (laptop) | |
| __m128 t0: 11492445700 | |
| __m128 t1: 11511440100 | |
| __m128 t2: 12243317700 | |
| __m256 t0: 11233625700 | |
| __m256 t1: 11075939700 | |
| __m256 t2: 12691638000 | |
| __m128 t0: 11489163200 | |
| __m128 t1: 11522478200 | |
| __m128 t2: 12288589600 | |
| __m256 t0: 11188765500 | |
| __m256 t1: 11132576900 | |
| __m256 t2: 12756802500 | |
| __m128 t0: 11872450200 | |
| __m128 t1: 11932667000 | |
| __m128 t2: 12679231200 | |
| __m256 t0: 11225789800 | |
| __m256 t1: 11152940300 | |
| __m256 t2: 12715785800 | |
| __m128 t0: 11405799700 | |
| __m128 t1: 11718692900 | |
| __m128 t2: 12216355900 | |
| __m256 t0: 11180072200 | |
| __m256 t1: 11210153500 | |
| __m256 t2: 12808761600 | |
| __m128 t0: 11527940800 | |
| __m128 t1: 11456309800 | |
| __m128 t2: 12228056400 | |
| __m256 t0: 11213356500 | |
| __m256 t1: 11122218500 | |
| __m256 t2: 12672972000 | |
| __m128 t0: 11465390400 | |
| __m128 t1: 11400557500 | |
| __m128 t2: 12142735300 | |
| __m256 t0: 11193956700 | |
| __m256 t1: 11161492000 | |
| __m256 t2: 12571592800 | |
| __m128 t0: 11673513700 | |
| __m128 t1: 11595063800 | |
| __m128 t2: 12536314300 | |
| __m256 t0: 11356026300 | |
| __m256 t1: 11062994100 | |
| __m256 t2: 12732637900 | |
| __m128 t0: 11385417400 | |
| __m128 t1: 11504470400 | |
| __m128 t2: 12357745200 | |
| __m256 t0: 11152852300 | |
| __m256 t1: 11191574500 | |
| __m256 t2: 12719821800 | |
| __m128 t0: 11637399100 | |
| __m128 t1: 11466271300 | |
| __m128 t2: 12404698100 | |
| __m256 t0: 11206328500 | |
| __m256 t1: 11134623200 | |
| __m256 t2: 12655092900 | |
| __m128 t0: 11574419100 | |
| __m128 t1: 11571126500 | |
| __m128 t2: 12352356500 | |
| __m256 t0: 11269339800 | |
| __m256 t1: 11155889600 | |
| __m256 t2: 12626584700 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Ran the benchmark 10 times on an Intel N150 | |
| __m128 t0: 20315919100 | |
| __m128 t1: 18374150300 | |
| __m128 t2: 18505095300 | |
| __m256 t0: 22141148300 | |
| __m256 t1: 20741017700 | |
| __m256 t2: 21424186400 | |
| __m128 t0: 21880002700 | |
| __m128 t1: 20815994400 | |
| __m128 t2: 20881685000 | |
| __m256 t0: 20148711100 | |
| __m256 t1: 18366306400 | |
| __m256 t2: 18823820000 | |
| __m128 t0: 23833858600 | |
| __m128 t1: 22794601600 | |
| __m128 t2: 22990721500 | |
| __m256 t0: 20314823500 | |
| __m256 t1: 18549870000 | |
| __m256 t2: 19034450300 | |
| __m128 t0: 20304760800 | |
| __m128 t1: 18470911900 | |
| __m128 t2: 18441249600 | |
| __m256 t0: 20457069300 | |
| __m256 t1: 18710773800 | |
| __m256 t2: 19095820300 | |
| __m128 t0: 20187214000 | |
| __m128 t1: 18273702500 | |
| __m128 t2: 18540695400 | |
| __m256 t0: 19955392200 | |
| __m256 t1: 18082994900 | |
| __m256 t2: 18574673000 | |
| __m128 t0: 20276459900 | |
| __m128 t1: 18341866900 | |
| __m128 t2: 18380558100 | |
| __m256 t0: 20110843900 | |
| __m256 t1: 18277752400 | |
| __m256 t2: 18602910200 | |
| __m128 t0: 21311202300 | |
| __m128 t1: 19589505300 | |
| __m128 t2: 19642981700 | |
| __m256 t0: 20987309400 | |
| __m256 t1: 19151178900 | |
| __m256 t2: 19634380700 | |
| __m128 t0: 21192805100 | |
| __m128 t1: 19186187700 | |
| __m128 t2: 19457090900 | |
| __m256 t0: 20663941500 | |
| __m256 t1: 18795906400 | |
| __m256 t2: 19369859400 | |
| __m128 t0: 20352025700 | |
| __m128 t1: 18359221600 | |
| __m128 t2: 18497774400 | |
| __m256 t0: 20098768000 | |
| __m256 t1: 18189933300 | |
| __m256 t2: 18695848400 | |
| __m128 t0: 20326489600 | |
| __m128 t1: 18457392700 | |
| __m128 t2: 18442232100 | |
| __m256 t0: 20040841300 | |
| __m256 t1: 18142117500 | |
| __m256 t2: 18558816100 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Compiled with Visual Studio 2026 | |
| Ran the benchmark 10 times on an AMD Ryzen 7 9700X | |
| __m128 t0: 3225655000 | |
| __m128 t1: 3032108100 | |
| __m128 t2: 3166396500 | |
| __m256 t0: 2747686200 | |
| __m256 t1: 2787105400 | |
| __m256 t2: 4424535200 | |
| __m512 t0: 2707363600 | |
| __m512 t1: 2709231300 | |
| __m512 t2: 3292690600 | |
| __m128 t0: 3161782000 | |
| __m128 t1: 3000353200 | |
| __m128 t2: 3119642900 | |
| __m256 t0: 2713622800 | |
| __m256 t1: 2785357400 | |
| __m256 t2: 4383887900 | |
| __m512 t0: 2696902200 | |
| __m512 t1: 2699328700 | |
| __m512 t2: 3301502000 | |
| __m128 t0: 3152186400 | |
| __m128 t1: 3000290800 | |
| __m128 t2: 3102441300 | |
| __m256 t0: 2722348900 | |
| __m256 t1: 2788805500 | |
| __m256 t2: 4347199100 | |
| __m512 t0: 2696948600 | |
| __m512 t1: 2704444100 | |
| __m512 t2: 3280392700 | |
| __m128 t0: 3169909300 | |
| __m128 t1: 3030758000 | |
| __m128 t2: 3076794100 | |
| __m256 t0: 2757798600 | |
| __m256 t1: 2850816800 | |
| __m256 t2: 4555484300 | |
| __m512 t0: 2713537800 | |
| __m512 t1: 2742288700 | |
| __m512 t2: 3278127600 | |
| __m128 t0: 3174084700 | |
| __m128 t1: 2997331000 | |
| __m128 t2: 3153295200 | |
| __m256 t0: 2734714400 | |
| __m256 t1: 2776162700 | |
| __m256 t2: 4421122800 | |
| __m512 t0: 2709261500 | |
| __m512 t1: 2715914700 | |
| __m512 t2: 3322122800 | |
| __m128 t0: 3165258700 | |
| __m128 t1: 3010237600 | |
| __m128 t2: 3121789400 | |
| __m256 t0: 2720896200 | |
| __m256 t1: 2787953500 | |
| __m256 t2: 4415293300 | |
| __m512 t0: 2702514100 | |
| __m512 t1: 2722176300 | |
| __m512 t2: 3294571400 | |
| __m128 t0: 3175596000 | |
| __m128 t1: 3025558100 | |
| __m128 t2: 3111990400 | |
| __m256 t0: 2733087800 | |
| __m256 t1: 2806578200 | |
| __m256 t2: 4579205400 | |
| __m512 t0: 2708626400 | |
| __m512 t1: 2724508700 | |
| __m512 t2: 3303557000 | |
| __m128 t0: 3173949900 | |
| __m128 t1: 3009483500 | |
| __m128 t2: 3107384300 | |
| __m256 t0: 2744631200 | |
| __m256 t1: 2777569500 | |
| __m256 t2: 4462769800 | |
| __m512 t0: 2712303200 | |
| __m512 t1: 2685437000 | |
| __m512 t2: 3282067900 | |
| __m128 t0: 3170675500 | |
| __m128 t1: 2955565300 | |
| __m128 t2: 3167042200 | |
| __m256 t0: 2799843400 | |
| __m256 t1: 2816240000 | |
| __m256 t2: 4481736300 | |
| __m512 t0: 2702602200 | |
| __m512 t1: 2659314700 | |
| __m512 t2: 3334647600 | |
| __m128 t0: 3149322100 | |
| __m128 t1: 2951201300 | |
| __m128 t2: 3163297700 | |
| __m256 t0: 2709264100 | |
| __m256 t1: 2754949200 | |
| __m256 t2: 4372588700 | |
| __m512 t0: 2690800200 | |
| __m512 t1: 2685653500 | |
| __m512 t2: 3355709800 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment