@chikuzen
Last active December 28, 2025 13:49
Speed comparison: division vs. multiplication by the reciprocal
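For reference, the three strategies timed below can be sketched in scalar form. This sketch is not part of the original gist and its function names are made up for illustration; it only shows the idea: divide directly, multiply by the hardware's approximate reciprocal (rcpss/rcpps, accurate to roughly 12 bits), or multiply by that approximation after one Newton-Raphson refinement step.

// Hypothetical scalar illustration (not from the gist) of the three variants.
#include <immintrin.h>

static float tan_div(float x, float y)           // corresponds to calc_tan0
{
    return x / y;
}

static float tan_rcp(float x, float y)           // corresponds to calc_tan1
{
    // rcpss returns an approximation of 1/y accurate to roughly 12 bits.
    float r = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(y)));
    return x * r;
}

static float tan_rcp_refined(float x, float y)   // corresponds to calc_tan2
{
    float r = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(y)));
    r = r * (2.0f - y * r);   // one Newton-Raphson step roughly doubles the correct bits
    return x * r;
}

The benchmarked program follows.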
#include <cstdio>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <chrono>
#include <format>
#include <type_traits>
#include <immintrin.h>

using namespace std::chrono;

#ifndef _WIN32
#define __forceinline inline __attribute__((always_inline))
#endif
// Compute horizontal/vertical gradients with a central difference ([-1, 0, +1]);
// the frame border is cleared to zero.
static void edge_detection(const float* srcp, float* gx, float* gy, size_t stride, size_t height)
{
    memset(gx, 0, stride * sizeof(float));
    memset(gy, 0, stride * sizeof(float));
    for (size_t y = 1; y < height - 1; ++y) {
        gx += stride;
        gy += stride;
        srcp += stride;
        gx[0] = 0;
        gy[0] = 0;
        for (size_t x = 1; x < stride - 1; ++x) {
            gx[x] = srcp[x - 1] - srcp[x + 1];
            gy[x] = srcp[x - stride] - srcp[x + stride];
        }
        gx[stride - 1] = 0;
        gy[stride - 1] = 0;
    }
    memset(gx + stride, 0, stride * sizeof(float));
    memset(gy + stride, 0, stride * sizeof(float));
}
template <typename T>
static __forceinline T load(const float* ptr)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_load_ps(ptr);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_load_ps(ptr);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_load_ps(ptr);
    }
#endif
#endif
}
template <typename T>
static __forceinline T mul(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_mul_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_mul_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_mul_ps(a, b);
    }
#endif
#endif
}
template <typename T>
static __forceinline T div(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_div_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_div_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_div_ps(a, b);
    }
#endif
#endif
}
template <typename T>
static __forceinline T add(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_add_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_add_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_add_ps(a, b);
    }
#endif
#endif
}
template <typename T>
static __forceinline T sub(const T& a, const T& b)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_sub_ps(a, b);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_sub_ps(a, b);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_sub_ps(a, b);
    }
#endif
#endif
}
// Approximate reciprocal (rcpps / vrcp14ps).
template <typename T>
static __forceinline T _rcp(const T& a)
{
    if constexpr (std::is_same_v<T, __m128>) {
        return _mm_rcp_ps(a);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        return _mm256_rcp_ps(a);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        return _mm512_rcp14_ps(a);
    }
#endif
#endif
}
template <typename T>
static __forceinline void store(float* ptr, const T& v)
{
    if constexpr (std::is_same_v<T, __m128>) {
        _mm_store_ps(ptr, v);
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        _mm256_store_ps(ptr, v);
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        _mm512_store_ps(ptr, v);
    }
#endif
#endif
}
// t0: plain division. The operands are ordered vy / vx so the result matches
// the reciprocal variants below, which compute gy * (1 / gx).
template <typename T>
int64_t calc_tan0(const float* gx, const float* gy, float* dstp, size_t stride, size_t height)
{
    auto start = system_clock::now();
    for (size_t y = 1; y < height - 1; ++y) {
        gx += stride;
        gy += stride;
        dstp += stride;
        for (size_t x = 0; x < stride; x += sizeof(T) / sizeof(float)) {
            T vx = load<T>(gx + x);
            T vy = load<T>(gy + x);
            store<T>(dstp + x, div<T>(vy, vx));
        }
    }
    auto end = system_clock::now();
    return duration_cast<nanoseconds>(end - start).count();
}
// t1: multiply by the approximate reciprocal, no refinement.
template <typename T>
int64_t calc_tan1(const float* gx, const float* gy, float* dstp, size_t stride, size_t height)
{
    auto start = system_clock::now();
    for (size_t y = 1; y < height - 1; ++y) {
        gx += stride;
        gy += stride;
        dstp += stride;
        for (size_t x = 0; x < stride; x += sizeof(T) / sizeof(float)) {
            T vx = load<T>(gx + x);
            T vy = load<T>(gy + x);
            T rcp = _rcp<T>(vx);
            store<T>(dstp + x, mul<T>(rcp, vy));
        }
    }
    auto end = system_clock::now();
    return duration_cast<nanoseconds>(end - start).count();
}
// Approximate reciprocal refined with one Newton-Raphson step:
// r1 = r0 * (2 - a * r0), which roughly doubles the number of correct bits.
template <typename T>
static __forceinline T rcphq(const T& a)
{
    if constexpr (std::is_same_v<T, __m128>) {
        __m128 rcp = _rcp(a);
        // 2*r0 - a*r0*r0 == r0 * (2 - a*r0), written without FMA for plain SSE.
        return sub(add(rcp, rcp), mul(a, mul(rcp, rcp)));
    }
#ifdef __AVX2__
    else if constexpr (std::is_same_v<T, __m256>) {
        static const __m256 two = _mm256_set1_ps(2.0f);
        __m256 rcp = _rcp(a);
        return mul(rcp, _mm256_fnmadd_ps(a, rcp, two));
    }
#ifdef __AVX512F__
    else if constexpr (std::is_same_v<T, __m512>) {
        static const __m512 two = _mm512_set1_ps(2.0f);
        __m512 rcp = _rcp(a);
        return mul(rcp, _mm512_fnmadd_ps(a, rcp, two));
    }
#endif
#endif
}
// t2: multiply by the refined reciprocal.
template <typename T>
int64_t calc_tan2(const float* gx, const float* gy, float* dstp, size_t stride, size_t height)
{
    auto start = system_clock::now();
    for (size_t y = 1; y < height - 1; ++y) {
        gx += stride;
        gy += stride;
        dstp += stride;
        for (size_t x = 0; x < stride; x += sizeof(T) / sizeof(float)) {
            T vx = load<T>(gx + x);
            T vy = load<T>(gy + x);
            T rcp = rcphq(vx);
            store<T>(dstp + x, mul<T>(rcp, vy));
        }
    }
    auto end = system_clock::now();
    return duration_cast<nanoseconds>(end - start).count();
}
int main(void)
{
    constexpr size_t stride = 1920;
    constexpr size_t height = 1080;
    constexpr size_t framesize = stride * height;

    void* buff = _mm_malloc(framesize * sizeof(float) * 6, 64);
    if (!buff) exit(1);
    float* srcp = reinterpret_cast<float*>(buff);
    float* gx = srcp + framesize;
    float* gy = gx + framesize;
    float* tan0 = gy + framesize;
    float* tan1 = tan0 + framesize;
    float* tan2 = tan1 + framesize;

    FILE* fp = fopen("./cr_float_1920x1080.raw", "rb");
    if (!fp) exit(1);
    if (fread(srcp, sizeof(float), framesize, fp) != framesize) exit(1);
    fclose(fp);

    edge_detection(srcp, gx, gy, stride, height);

    int64_t t0 = 0, t1 = 0, t2 = 0;
    for (int i = 0; i < 10000; ++i) {
        t0 += calc_tan0<__m128>(gx, gy, tan0, stride, height);
        t1 += calc_tan1<__m128>(gx, gy, tan1, stride, height);
        t2 += calc_tan2<__m128>(gx, gy, tan2, stride, height);
    }
    puts(std::format("__m128 t0: {}", t0).c_str());
    puts(std::format("__m128 t1: {}", t1).c_str());
    puts(std::format("__m128 t2: {}", t2).c_str());
    puts("");

#ifdef __AVX2__
    t0 = 0, t1 = 0, t2 = 0;
    for (int i = 0; i < 10000; ++i) {
        t0 += calc_tan0<__m256>(gx, gy, tan0, stride, height);
        t1 += calc_tan1<__m256>(gx, gy, tan1, stride, height);
        t2 += calc_tan2<__m256>(gx, gy, tan2, stride, height);
    }
    puts(std::format("__m256 t0: {}", t0).c_str());
    puts(std::format("__m256 t1: {}", t1).c_str());
    puts(std::format("__m256 t2: {}", t2).c_str());
    puts("");
#endif

#ifdef __AVX512F__
    t0 = 0, t1 = 0, t2 = 0;
    for (int i = 0; i < 10000; ++i) {
        t0 += calc_tan0<__m512>(gx, gy, tan0, stride, height);
        t1 += calc_tan1<__m512>(gx, gy, tan1, stride, height);
        t2 += calc_tan2<__m512>(gx, gy, tan2, stride, height);
    }
    puts(std::format("__m512 t0: {}", t0).c_str());
    puts(std::format("__m512 t1: {}", t1).c_str());
    puts(std::format("__m512 t2: {}", t2).c_str());
    puts("");
#endif

    _mm_free(buff);
    return 0;
}
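The gist does not include build instructions. A minimal sketch, assuming GCC/Clang or MSVC and a hypothetical file name benchmark.cpp (std::format requires C++20; the AVX2/AVX-512 paths need the matching architecture flags; the exact flags used by the author are not stated):

g++ -O2 -std=c++20 -mavx2 -mfma -o benchmark benchmark.cpp
g++ -O2 -std=c++20 -mavx2 -mfma -mavx512f -o benchmark benchmark.cpp
cl /O2 /std:c++20 /arch:AVX2 benchmark.cpp
cl /O2 /std:c++20 /arch:AVX512 benchmark.cpp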
Benchmark run 10 times on an Intel Core i7-4770. All times are accumulated nanoseconds.
__m128 t0: 16141603900
__m128 t1: 16006334100
__m128 t2: 16458018300
__m256 t0: 16232862700
__m256 t1: 16341678800
__m256 t2: 17506229300
__m128 t0: 16128331900
__m128 t1: 16012951800
__m128 t2: 16281443300
__m256 t0: 16234015800
__m256 t1: 16360998000
__m256 t2: 17222480900
__m128 t0: 16320674000
__m128 t1: 16245989400
__m128 t2: 16469393100
__m256 t0: 16417865900
__m256 t1: 16526099500
__m256 t2: 17385912800
__m128 t0: 16176071800
__m128 t1: 16158048900
__m128 t2: 16389021200
__m256 t0: 16253484600
__m256 t1: 16464722100
__m256 t2: 17305499100
__m128 t0: 16152353600
__m128 t1: 16014629700
__m128 t2: 16377454700
__m256 t0: 16249665800
__m256 t1: 16344594300
__m256 t2: 17280650800
__m128 t0: 16141269300
__m128 t1: 16135545600
__m128 t2: 16494164700
__m256 t0: 16131817300
__m256 t1: 16393160300
__m256 t2: 17194029200
__m128 t0: 16089997000
__m128 t1: 16033717000
__m128 t2: 16364618000
__m256 t0: 16179000900
__m256 t1: 16365154100
__m256 t2: 17253009000
__m128 t0: 16179979000
__m128 t1: 16052390100
__m128 t2: 16445640500
__m256 t0: 16222803600
__m256 t1: 16326864500
__m256 t2: 17286472200
__m128 t0: 16177169600
__m128 t1: 15996007000
__m128 t2: 16322758500
__m256 t0: 16278922500
__m256 t1: 16345059400
__m256 t2: 17250139300
__m128 t0: 16157409800
__m128 t1: 16038530800
__m128 t2: 16390106900
__m256 t0: 16559159400
__m256 t1: 16714210000
__m256 t2: 17722736200
Benchmark run 10 times on an Intel Core i7-7700HQ (laptop).
__m128 t0: 11492445700
__m128 t1: 11511440100
__m128 t2: 12243317700
__m256 t0: 11233625700
__m256 t1: 11075939700
__m256 t2: 12691638000
__m128 t0: 11489163200
__m128 t1: 11522478200
__m128 t2: 12288589600
__m256 t0: 11188765500
__m256 t1: 11132576900
__m256 t2: 12756802500
__m128 t0: 11872450200
__m128 t1: 11932667000
__m128 t2: 12679231200
__m256 t0: 11225789800
__m256 t1: 11152940300
__m256 t2: 12715785800
__m128 t0: 11405799700
__m128 t1: 11718692900
__m128 t2: 12216355900
__m256 t0: 11180072200
__m256 t1: 11210153500
__m256 t2: 12808761600
__m128 t0: 11527940800
__m128 t1: 11456309800
__m128 t2: 12228056400
__m256 t0: 11213356500
__m256 t1: 11122218500
__m256 t2: 12672972000
__m128 t0: 11465390400
__m128 t1: 11400557500
__m128 t2: 12142735300
__m256 t0: 11193956700
__m256 t1: 11161492000
__m256 t2: 12571592800
__m128 t0: 11673513700
__m128 t1: 11595063800
__m128 t2: 12536314300
__m256 t0: 11356026300
__m256 t1: 11062994100
__m256 t2: 12732637900
__m128 t0: 11385417400
__m128 t1: 11504470400
__m128 t2: 12357745200
__m256 t0: 11152852300
__m256 t1: 11191574500
__m256 t2: 12719821800
__m128 t0: 11637399100
__m128 t1: 11466271300
__m128 t2: 12404698100
__m256 t0: 11206328500
__m256 t1: 11134623200
__m256 t2: 12655092900
__m128 t0: 11574419100
__m128 t1: 11571126500
__m128 t2: 12352356500
__m256 t0: 11269339800
__m256 t1: 11155889600
__m256 t2: 12626584700
Benchmark run 10 times on an Intel N150.
__m128 t0: 20315919100
__m128 t1: 18374150300
__m128 t2: 18505095300
__m256 t0: 22141148300
__m256 t1: 20741017700
__m256 t2: 21424186400
__m128 t0: 21880002700
__m128 t1: 20815994400
__m128 t2: 20881685000
__m256 t0: 20148711100
__m256 t1: 18366306400
__m256 t2: 18823820000
__m128 t0: 23833858600
__m128 t1: 22794601600
__m128 t2: 22990721500
__m256 t0: 20314823500
__m256 t1: 18549870000
__m256 t2: 19034450300
__m128 t0: 20304760800
__m128 t1: 18470911900
__m128 t2: 18441249600
__m256 t0: 20457069300
__m256 t1: 18710773800
__m256 t2: 19095820300
__m128 t0: 20187214000
__m128 t1: 18273702500
__m128 t2: 18540695400
__m256 t0: 19955392200
__m256 t1: 18082994900
__m256 t2: 18574673000
__m128 t0: 20276459900
__m128 t1: 18341866900
__m128 t2: 18380558100
__m256 t0: 20110843900
__m256 t1: 18277752400
__m256 t2: 18602910200
__m128 t0: 21311202300
__m128 t1: 19589505300
__m128 t2: 19642981700
__m256 t0: 20987309400
__m256 t1: 19151178900
__m256 t2: 19634380700
__m128 t0: 21192805100
__m128 t1: 19186187700
__m128 t2: 19457090900
__m256 t0: 20663941500
__m256 t1: 18795906400
__m256 t2: 19369859400
__m128 t0: 20352025700
__m128 t1: 18359221600
__m128 t2: 18497774400
__m256 t0: 20098768000
__m256 t1: 18189933300
__m256 t2: 18695848400
__m128 t0: 20326489600
__m128 t1: 18457392700
__m128 t2: 18442232100
__m256 t0: 20040841300
__m256 t1: 18142117500
__m256 t2: 18558816100
Compiled with Visual Studio 2026.
Benchmark run 10 times on an AMD Ryzen 7 9700X.
__m128 t0: 3225655000
__m128 t1: 3032108100
__m128 t2: 3166396500
__m256 t0: 2747686200
__m256 t1: 2787105400
__m256 t2: 4424535200
__m512 t0: 2707363600
__m512 t1: 2709231300
__m512 t2: 3292690600
__m128 t0: 3161782000
__m128 t1: 3000353200
__m128 t2: 3119642900
__m256 t0: 2713622800
__m256 t1: 2785357400
__m256 t2: 4383887900
__m512 t0: 2696902200
__m512 t1: 2699328700
__m512 t2: 3301502000
__m128 t0: 3152186400
__m128 t1: 3000290800
__m128 t2: 3102441300
__m256 t0: 2722348900
__m256 t1: 2788805500
__m256 t2: 4347199100
__m512 t0: 2696948600
__m512 t1: 2704444100
__m512 t2: 3280392700
__m128 t0: 3169909300
__m128 t1: 3030758000
__m128 t2: 3076794100
__m256 t0: 2757798600
__m256 t1: 2850816800
__m256 t2: 4555484300
__m512 t0: 2713537800
__m512 t1: 2742288700
__m512 t2: 3278127600
__m128 t0: 3174084700
__m128 t1: 2997331000
__m128 t2: 3153295200
__m256 t0: 2734714400
__m256 t1: 2776162700
__m256 t2: 4421122800
__m512 t0: 2709261500
__m512 t1: 2715914700
__m512 t2: 3322122800
__m128 t0: 3165258700
__m128 t1: 3010237600
__m128 t2: 3121789400
__m256 t0: 2720896200
__m256 t1: 2787953500
__m256 t2: 4415293300
__m512 t0: 2702514100
__m512 t1: 2722176300
__m512 t2: 3294571400
__m128 t0: 3175596000
__m128 t1: 3025558100
__m128 t2: 3111990400
__m256 t0: 2733087800
__m256 t1: 2806578200
__m256 t2: 4579205400
__m512 t0: 2708626400
__m512 t1: 2724508700
__m512 t2: 3303557000
__m128 t0: 3173949900
__m128 t1: 3009483500
__m128 t2: 3107384300
__m256 t0: 2744631200
__m256 t1: 2777569500
__m256 t2: 4462769800
__m512 t0: 2712303200
__m512 t1: 2685437000
__m512 t2: 3282067900
__m128 t0: 3170675500
__m128 t1: 2955565300
__m128 t2: 3167042200
__m256 t0: 2799843400
__m256 t1: 2816240000
__m256 t2: 4481736300
__m512 t0: 2702602200
__m512 t1: 2659314700
__m512 t2: 3334647600
__m128 t0: 3149322100
__m128 t1: 2951201300
__m128 t2: 3163297700
__m256 t0: 2709264100
__m256 t1: 2754949200
__m256 t2: 4372588700
__m512 t0: 2690800200
__m512 t1: 2685653500
__m512 t2: 3355709800