Skip to content

Instantly share code, notes, and snippets.

@madwareru
Last active April 6, 2020 11:35
Show Gist options
  • Select an option

  • Save madwareru/fd9c8c07f7856a3e1d0c7c6577502799 to your computer and use it in GitHub Desktop.

Select an option

Save madwareru/fd9c8c07f7856a3e1d0c7c6577502799 to your computer and use it in GitHub Desktop.
use std::ptr;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use std::arch::x86_64::*;
/// Upscales a `width` x `height` image by a factor of two in each dimension
/// (nearest neighbour) using SSE2 unpack instructions, four pixels at a time.
///
/// `pixels` is read row by row; only the first `width * height` elements are
/// used. The returned buffer holds `4 * width * height` pixels laid out as
/// `2 * height` rows of `2 * width` pixels.
///
/// # Panics
///
/// Panics if `width` is not a multiple of 4, if `pixels` holds fewer than
/// `width * height` elements, or if the output size does not fit in `usize`.
fn scale_2x_unsafy_simd(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    assert_eq!(width % 4, 0, "width must be a multiple of 4");

    // Do all size arithmetic in `usize`: the `u32` product can wrap in release
    // builds, which would leave the output buffer too small for the raw writes.
    let width = width as usize;
    let height = height as usize;
    let pixel_count = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("scaled image size overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixel buffer holds {} pixels, but a {}x{} image needs {}",
        pixels.len(),
        width,
        height,
        pixel_count
    );

    let mut output: Vec<u32> = vec![0; whole_size];
    if pixel_count == 0 {
        return output;
    }

    let quads_per_row = width / 4;
    let dbl_width = width * 2;
    let mut pp = pixels.as_ptr();
    let mut pp_out = output.as_mut_ptr();

    // SAFETY: `pixels` holds at least `pixel_count` elements (asserted above)
    // and `width` is a multiple of 4, so each 4-pixel unaligned load stays in
    // bounds. `output` holds `4 * pixel_count` elements; every source row
    // stores `2 * width` doubled pixels and then copies them into the next
    // `2 * width` slots, i.e. `4 * width` per row, so all writes stay inside
    // the buffer. The loads and stores are the unaligned variants, and the row
    // copy goes through `u32` pointers, so only `u32` alignment is required.
    // The copied ranges are adjacent and therefore disjoint.
    unsafe {
        for _ in 0..height {
            for _ in 0..quads_per_row {
                let quad = _mm_loadu_si128(pp as *const __m128i);
                // [a, b, c, d] -> [a, a, b, b]
                _mm_storeu_si128(pp_out as *mut __m128i, _mm_unpacklo_epi32(quad, quad));
                // [a, b, c, d] -> [c, c, d, d]
                _mm_storeu_si128(
                    pp_out.add(4) as *mut __m128i,
                    _mm_unpackhi_epi32(quad, quad),
                );
                pp = pp.add(4);
                pp_out = pp_out.add(8);
            }
            // Duplicate the row that was just written into the row below it.
            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
            pp_out = pp_out.add(dbl_width);
        }
    }
    output
}
/// Upscales a `width` x `height` image by a factor of two in each dimension
/// (nearest neighbour) using raw pointer writes.
///
/// `pixels` is read row by row; only the first `width * height` elements are
/// used. The returned buffer holds `4 * width * height` pixels laid out as
/// `2 * height` rows of `2 * width` pixels.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size does not fit in `usize`.
fn scale_2x_unsafy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    // Do all size arithmetic in `usize`: the `u32` product can wrap in release
    // builds, which would leave the output buffer too small for the raw writes.
    let width = width as usize;
    let height = height as usize;
    let pixel_count = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("scaled image size overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixel buffer holds {} pixels, but a {}x{} image needs {}",
        pixels.len(),
        width,
        height,
        pixel_count
    );

    let mut output: Vec<u32> = vec![0; whole_size];
    if pixel_count == 0 {
        return output;
    }

    let dbl_width = width * 2;
    let mut pp = pixels.as_ptr();
    let mut pp_out = output.as_mut_ptr();

    // SAFETY: `pixels` holds at least `pixel_count` elements (asserted above)
    // and `pp` advances once per pixel, so every read is in bounds. `output`
    // holds `4 * pixel_count` elements; every source row writes `2 * width`
    // doubled pixels and then copies them into the next `2 * width` slots,
    // i.e. `4 * width` per row, so all writes stay inside the buffer. The
    // copied ranges are adjacent and therefore disjoint.
    unsafe {
        for _ in 0..height {
            for _ in 0..width {
                let pixel = *pp;
                *pp_out = pixel;
                *pp_out.add(1) = pixel;
                pp_out = pp_out.add(2);
                pp = pp.add(1);
            }
            // Duplicate the row that was just written into the row below it.
            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
            pp_out = pp_out.add(dbl_width);
        }
    }
    output
}
/// Upscales a `width` x `height` image by a factor of two in each dimension
/// (nearest neighbour), writing each horizontally doubled pixel as a single
/// 64-bit store.
///
/// `pixels` is read row by row; only the first `width * height` elements are
/// used. The returned buffer holds `4 * width * height` pixels laid out as
/// `2 * height` rows of `2 * width` pixels.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size does not fit in `usize`.
fn scale_2x_unsafy_crazy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    // Do all size arithmetic in `usize`: the `u32` product can wrap in release
    // builds, which would leave the output buffer too small for the raw writes.
    let width = width as usize;
    let height = height as usize;
    let pixel_count = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("scaled image size overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixel buffer holds {} pixels, but a {}x{} image needs {}",
        pixels.len(),
        width,
        height,
        pixel_count
    );

    let mut output: Vec<u32> = vec![0; whole_size];
    if pixel_count == 0 {
        return output;
    }

    let dbl_width = width * 2;
    let mut pp = pixels.as_ptr();
    let mut pp_out = output.as_mut_ptr();

    // SAFETY: `pixels` holds at least `pixel_count` elements (asserted above)
    // and `pp` advances once per pixel, so every read is in bounds. `output`
    // holds `4 * pixel_count` elements; every source row writes `width`
    // 64-bit values (`2 * width` pixels) and then copies them into the next
    // `2 * width` slots, i.e. `4 * width` per row, so all writes stay inside
    // the buffer. The buffer is only guaranteed to be `u32`-aligned, hence the
    // 64-bit store uses `write_unaligned` and the row copy uses `u32`
    // pointers. The copied ranges are adjacent and therefore disjoint.
    unsafe {
        for _ in 0..height {
            for _ in 0..width {
                // Multiplying by 2^32 + 1 places the pixel in both 32-bit
                // halves, so the result is the same for either byte order.
                let doubled = u64::from(*pp) * 0x1_0000_0001;
                (pp_out as *mut u64).write_unaligned(doubled);
                pp_out = pp_out.add(2);
                pp = pp.add(1);
            }
            // Duplicate the row that was just written into the row below it.
            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
            pp_out = pp_out.add(dbl_width);
        }
    }
    output
}
/// Upscales a `width` x `height` image by a factor of two in each dimension
/// (nearest neighbour) using only safe code.
///
/// `pixels` is read row by row; only the first `width * height` elements are
/// used. The returned buffer holds `4 * width * height` pixels laid out as
/// `2 * height` rows of `2 * width` pixels.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size does not fit in `usize`.
fn scale_2x(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    // Do all size arithmetic in `usize` so large images cannot wrap the `u32`
    // product.
    let width = width as usize;
    let height = height as usize;
    let pixel_count = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("scaled image size overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixel buffer holds {} pixels, but a {}x{} image needs {}",
        pixels.len(),
        width,
        height,
        pixel_count
    );

    // `chunks_exact` rejects a chunk size of zero, so handle empty images here.
    if pixel_count == 0 {
        return Vec::new();
    }

    let dbl_width = width * 2;
    let mut output: Vec<u32> = vec![0; whole_size];

    // Each source row maps to two consecutive output rows.
    let src_rows = pixels[..pixel_count].chunks_exact(width);
    let dst_row_pairs = output.chunks_exact_mut(dbl_width * 2);
    for (src_row, dst_rows) in src_rows.zip(dst_row_pairs) {
        let (top, bottom) = dst_rows.split_at_mut(dbl_width);
        for (pair, &pixel) in top.chunks_exact_mut(2).zip(src_row) {
            pair[0] = pixel;
            pair[1] = pixel;
        }
        bottom.copy_from_slice(top);
    }
    output
}
/// Upscales a `width` x `height` image by a factor of two in each dimension
/// (nearest neighbour) with the inner loop manually unrolled eight pixels at
/// a time.
///
/// Widths that are not a multiple of eight are supported: the pixels left
/// over after the last full block of eight are handled one at a time.
///
/// `pixels` is read row by row; only the first `width * height` elements are
/// used. The returned buffer holds `4 * width * height` pixels laid out as
/// `2 * height` rows of `2 * width` pixels.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size does not fit in `usize`.
fn scale_2x_unwind(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    /// Number of source pixels handled per unrolled iteration.
    const UNROLL: usize = 8;

    // Do all size arithmetic in `usize` so large images cannot wrap the `u32`
    // product.
    let width = width as usize;
    let height = height as usize;
    let pixel_count = width
        .checked_mul(height)
        .expect("width * height overflows usize");
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("scaled image size overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixel buffer holds {} pixels, but a {}x{} image needs {}",
        pixels.len(),
        width,
        height,
        pixel_count
    );

    // `chunks_exact` rejects a chunk size of zero, so handle empty images here.
    if pixel_count == 0 {
        return Vec::new();
    }

    let dbl_width = width * 2;
    let mut output: Vec<u32> = vec![0; whole_size];

    // Each source row maps to two consecutive output rows.
    let src_rows = pixels[..pixel_count].chunks_exact(width);
    let dst_row_pairs = output.chunks_exact_mut(dbl_width * 2);
    for (src_row, dst_rows) in src_rows.zip(dst_row_pairs) {
        let (top, bottom) = dst_rows.split_at_mut(dbl_width);

        let mut src_blocks = src_row.chunks_exact(UNROLL);
        let mut dst_blocks = top.chunks_exact_mut(UNROLL * 2);
        for (src, dst) in src_blocks.by_ref().zip(dst_blocks.by_ref()) {
            dst[0] = src[0];
            dst[1] = src[0];
            dst[2] = src[1];
            dst[3] = src[1];
            dst[4] = src[2];
            dst[5] = src[2];
            dst[6] = src[3];
            dst[7] = src[3];
            dst[8] = src[4];
            dst[9] = src[4];
            dst[10] = src[5];
            dst[11] = src[5];
            dst[12] = src[6];
            dst[13] = src[6];
            dst[14] = src[7];
            dst[15] = src[7];
        }

        // Fewer than `UNROLL` pixels may remain at the end of the row.
        let src_tail = src_blocks.remainder();
        let dst_tail = dst_blocks.into_remainder();
        for (&pixel, pair) in src_tail.iter().zip(dst_tail.chunks_exact_mut(2)) {
            pair[0] = pixel;
            pair[1] = pixel;
        }

        bottom.copy_from_slice(top);
    }
    output
}
/// Seed image for the benchmarks: 256 pixels (used as a 16x16 image) holding
/// the values `1..=8` repeated over and over.
const ARR: [u32; 256] = {
    let mut table = [0u32; 256];
    let mut index = 0;
    while index < table.len() {
        // `index % 8` is always below 8, so the cast cannot truncate.
        table[index] = (index % 8) as u32 + 1;
        index += 1;
    }
    table
};
fn bench_scalers(c: &mut Criterion) {
c.bench_function("bench scaler_2x", |b| b.iter(|| {
let scaled = scale_2x(&ARR, 16, 16);
let scaled = scale_2x(&scaled, 32, 32);
let scaled = scale_2x(&scaled, 64, 64);
let scaled = scale_2x(&scaled, 128, 128);
let scaled = scale_2x(&scaled, 256, 256);
let scaled = scale_2x(&scaled, 512, 512);
let scaled = scale_2x(&scaled, 1024, 1024);
let scaled = scale_2x(&scaled, 2048, 2048);
black_box(scaled)
}));
c.bench_function("bench scaler_2x_unwind", |b| b.iter(|| {
let scaled = scale_2x_unwind(&ARR, 16, 16);
let scaled = scale_2x_unwind(&scaled, 32, 32);
let scaled = scale_2x_unwind(&scaled, 64, 64);
let scaled = scale_2x_unwind(&scaled, 128, 128);
let scaled = scale_2x_unwind(&scaled, 256, 256);
let scaled = scale_2x_unwind(&scaled, 512, 512);
let scaled = scale_2x_unwind(&scaled, 1024, 1024);
let scaled = scale_2x_unwind(&scaled, 2048, 2048);
black_box(scaled)
}));
c.bench_function("bench scaler_2x_unsafy", |b| b.iter(|| {
let scaled = scale_2x_unsafy(&ARR, 16, 16);
let scaled = scale_2x_unsafy(&scaled, 32, 32);
let scaled = scale_2x_unsafy(&scaled, 64, 64);
let scaled = scale_2x_unsafy(&scaled, 128, 128);
let scaled = scale_2x_unsafy(&scaled, 256, 256);
let scaled = scale_2x_unsafy(&scaled, 512, 512);
let scaled = scale_2x_unsafy(&scaled, 1024, 1024);
let scaled = scale_2x_unsafy(&scaled, 2048, 2048);
black_box(scaled)
}));
c.bench_function("bench scaler_2x_unsafy_crazy", |b| b.iter(|| {
let scaled = scale_2x_unsafy_crazy(&ARR, 16, 16);
let scaled = scale_2x_unsafy_crazy(&scaled, 32, 32);
let scaled = scale_2x_unsafy_crazy(&scaled, 64, 64);
let scaled = scale_2x_unsafy_crazy(&scaled, 128, 128);
let scaled = scale_2x_unsafy_crazy(&scaled, 256, 256);
let scaled = scale_2x_unsafy_crazy(&scaled, 512, 512);
let scaled = scale_2x_unsafy_crazy(&scaled, 1024, 1024);
let scaled = scale_2x_unsafy_crazy(&scaled, 2048, 2048);
black_box(scaled)
}));
c.bench_function("bench scaler_2x_unsafy_simd", |b| b.iter(|| {
let scaled = scale_2x_unsafy_simd(&ARR, 16, 16);
let scaled = scale_2x_unsafy_simd(&scaled, 32, 32);
let scaled = scale_2x_unsafy_simd(&scaled, 64, 64);
let scaled = scale_2x_unsafy_simd(&scaled, 128, 128);
let scaled = scale_2x_unsafy_simd(&scaled, 256, 256);
let scaled = scale_2x_unsafy_simd(&scaled, 512, 512);
let scaled = scale_2x_unsafy_simd(&scaled, 1024, 1024);
let scaled = scale_2x_unsafy_simd(&scaled, 2048, 2048);
black_box(scaled)
}));
}
// Collect `bench_scalers` into a Criterion benchmark group named `scalers`.
criterion_group!(scalers, bench_scalers);
// Generate the benchmark entry point that runs the `scalers` group.
criterion_main!(scalers);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment