madwareru · April 6, 2020 11:35
diff --git a/bench_scalers.rs b/bench_scalers.rs
 use std::ptr;
 use criterion::{black_box, criterion_group, criterion_main, Criterion};

 use std::arch::x86_64::*;

 /// Upscales a row-major `width` x `height` image by a factor of two using
 /// SSE2 unpack instructions, four source pixels at a time.
 ///
 /// Every source pixel becomes a 2x2 block in the returned buffer, which has
 /// `width * height * 4` entries.
 ///
 /// # Panics
 ///
 /// Panics if `width` is not a multiple of 4 or if `pixels` holds fewer than
 /// `width * height` entries.
 fn scale_2x_unsafy_simd(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    assert_eq!(width % 4, 0);
    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
    let width = width as usize;
    let height = height as usize;
    let src_len = width * height;
    assert!(
        pixels.len() >= src_len,
        "pixel buffer is smaller than width * height"
    );
    let dbl_width = 2 * width;
    let quads_per_row = width / 4;

    let mut output = vec![0u32; src_len * 4];

    let mut pp = pixels.as_ptr();
    let mut pp_out = output.as_mut_ptr();

    // SAFETY: the loops read exactly `src_len` source pixels (checked against
    // `pixels.len()` above) and write exactly `4 * src_len` output pixels, the
    // length of `output`. The load/store intrinsics used are the unaligned
    // variants, and the row copy works on `u32` pointers, so nothing stricter
    // than `u32` alignment is required. The copied rows are adjacent and
    // therefore never overlap.
    unsafe {
        for _ in 0..height {
            for _ in 0..quads_per_row {
                let quad = _mm_loadu_si128(pp as *const __m128i);
                // [a, b, c, d] -> [a, a, b, b]
                _mm_storeu_si128(pp_out as *mut __m128i, _mm_unpacklo_epi32(quad, quad));
                // [a, b, c, d] -> [c, c, d, d]
                _mm_storeu_si128(
                    pp_out.add(4) as *mut __m128i,
                    _mm_unpackhi_epi32(quad, quad),
                );
                pp = pp.add(4);
                pp_out = pp_out.add(8);
            }
            // Duplicate the freshly written row into the row below it.
            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
            pp_out = pp_out.add(dbl_width);
        }
    }
    output
 }

 /// Upscales a row-major `width` x `height` image by a factor of two using
 /// raw pointer writes.
 ///
 /// Every source pixel becomes a 2x2 block in the returned buffer, which has
 /// `width * height * 4` entries.
 ///
 /// # Panics
 ///
 /// Panics if `pixels` holds fewer than `width * height` entries.
 fn scale_2x_unsafy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
    let width = width as usize;
    let height = height as usize;
    let src_len = width * height;
    assert!(
        pixels.len() >= src_len,
        "pixel buffer is smaller than width * height"
    );
    let dbl_width = width * 2;

    let mut output = vec![0u32; src_len * 4];

    let mut pp = pixels.as_ptr();
    let mut pp_out = output.as_mut_ptr();

    // SAFETY: the loops read exactly `src_len` source pixels (checked against
    // `pixels.len()` above) and write exactly `4 * src_len` output pixels, the
    // length of `output`. The copied rows are adjacent and never overlap.
    unsafe {
        for _ in 0..height {
            for _ in 0..width {
                let pixel = *pp;
                *pp_out = pixel;
                *pp_out.add(1) = pixel;
                pp_out = pp_out.add(2);
                pp = pp.add(1);
            }
            // Duplicate the freshly written row into the row below it.
            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
            pp_out = pp_out.add(dbl_width);
        }
    }
    output
 }

 /// Upscales a row-major `width` x `height` image by a factor of two, writing
 /// each horizontally doubled pixel as a single 64-bit store.
 ///
 /// Every source pixel becomes a 2x2 block in the returned buffer, which has
 /// `width * height * 4` entries.
 ///
 /// # Panics
 ///
 /// Panics if `pixels` holds fewer than `width * height` entries.
 fn scale_2x_unsafy_crazy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
    let width = width as usize;
    let height = height as usize;
    let src_len = width * height;
    assert!(
        pixels.len() >= src_len,
        "pixel buffer is smaller than width * height"
    );
    let dbl_width = width * 2;

    let mut output = vec![0u32; src_len * 4];

    let mut pp = pixels.as_ptr();
    let mut pp_out = output.as_mut_ptr();

    // SAFETY: the loops read exactly `src_len` source pixels (checked against
    // `pixels.len()` above) and write exactly `4 * src_len` output pixels, the
    // length of `output`. A `Vec<u32>` is only guaranteed to be 4-byte aligned,
    // so the 64-bit store uses `write_unaligned` and the row copy works on
    // `u32` pointers. The copied rows are adjacent and never overlap.
    unsafe {
        for _ in 0..height {
            for _ in 0..width {
                // Multiplying by 2^32 + 1 places the pixel in both 32-bit halves.
                let doubled = u64::from(*pp) * 0x1_0000_0001;
                (pp_out as *mut u64).write_unaligned(doubled);
                pp_out = pp_out.add(2);
                pp = pp.add(1);
            }
            // Duplicate the freshly written row into the row below it.
            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
            pp_out = pp_out.add(dbl_width);
        }
    }
    output
 }

 /// Upscales a row-major `width` x `height` image by a factor of two using
 /// only safe code.
 ///
 /// Every source pixel becomes a 2x2 block in the returned buffer, which has
 /// `width * height * 4` entries.
 ///
 /// # Panics
 ///
 /// Panics if `pixels` holds fewer than `width * height` entries.
 fn scale_2x(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
    let width = width as usize;
    let height = height as usize;
    let dbl_width = width * 2;
    assert!(
        pixels.len() >= width * height,
        "pixel buffer is smaller than width * height"
    );

    let mut output = vec![0u32; dbl_width * height * 2];
    if width == 0 {
        // Nothing to write, and `chunks_exact(0)` would panic.
        return output;
    }

    // Each source row maps onto two consecutive output rows.
    let src_rows = pixels.chunks_exact(width);
    let dst_row_pairs = output.chunks_exact_mut(dbl_width * 2);
    for (src_row, dst_rows) in src_rows.zip(dst_row_pairs) {
        let (first_row, second_row) = dst_rows.split_at_mut(dbl_width);
        for (&pixel, pair) in src_row.iter().zip(first_row.chunks_exact_mut(2)) {
            pair[0] = pixel;
            pair[1] = pixel;
        }
        second_row.copy_from_slice(first_row);
    }
    output
 }

 /// Upscales a row-major `width` x `height` image by a factor of two, with the
 /// horizontal doubling manually unrolled to eight pixels per step.
 ///
 /// Widths that are not a multiple of eight are supported: the leftover pixels
 /// of each row are doubled one at a time. Every source pixel becomes a 2x2
 /// block in the returned buffer, which has `width * height * 4` entries.
 ///
 /// # Panics
 ///
 /// Panics if `pixels` holds fewer than `width * height` entries.
 fn scale_2x_unwind(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    // Number of source pixels handled by one unrolled step.
    const BLOCK: usize = 8;

    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
    let width = width as usize;
    let height = height as usize;
    let dbl_width = width * 2;
    assert!(
        pixels.len() >= width * height,
        "pixel buffer is smaller than width * height"
    );

    let mut output = vec![0u32; dbl_width * height * 2];
    if width == 0 {
        // Nothing to write, and `chunks_exact(0)` would panic.
        return output;
    }

    // Part of each row that is covered by whole unrolled blocks.
    let unrolled = width - width % BLOCK;

    let src_rows = pixels.chunks_exact(width);
    let dst_row_pairs = output.chunks_exact_mut(dbl_width * 2);
    for (src_row, dst_rows) in src_rows.zip(dst_row_pairs) {
        let (first_row, second_row) = dst_rows.split_at_mut(dbl_width);
        let (src_blocks, src_tail) = src_row.split_at(unrolled);
        let (dst_blocks, dst_tail) = first_row.split_at_mut(unrolled * 2);

        let blocks = src_blocks
            .chunks_exact(BLOCK)
            .zip(dst_blocks.chunks_exact_mut(BLOCK * 2));
        for (src, dst) in blocks {
            dst.copy_from_slice(&[
                src[0], src[0], src[1], src[1], src[2], src[2], src[3], src[3],
                src[4], src[4], src[5], src[5], src[6], src[6], src[7], src[7],
            ]);
        }
        // Remainder of the row when `width` is not a multiple of `BLOCK`.
        for (&pixel, pair) in src_tail.iter().zip(dst_tail.chunks_exact_mut(2)) {
            pair[0] = pixel;
            pair[1] = pixel;
        }
        second_row.copy_from_slice(first_row);
    }
    output
 }

 /// 16x16 test image: the values 1 through 8 repeated across all 256 entries.
 const ARR: [u32; 256] = {
    let mut table = [0u32; 256];
    let mut idx = 0;
    while idx < table.len() {
        table[idx] = (idx % 8) as u32 + 1;
        idx += 1;
    }
    table
 };

 fn bench_scalers(c: &mut Criterion) {
    c.bench_function("bench scaler_2x", |b| b.iter(|| {
        let scaled = scale_2x(&ARR, 16, 16);
        let scaled = scale_2x(&scaled, 32, 32);
        let scaled = scale_2x(&scaled, 64, 64);
        let scaled = scale_2x(&scaled, 128, 128);
        let scaled = scale_2x(&scaled, 256, 256);
        let scaled = scale_2x(&scaled, 512, 512);
        let scaled = scale_2x(&scaled, 1024, 1024);
        let scaled = scale_2x(&scaled, 2048, 2048);
        black_box(scaled)
    }));

    c.bench_function("bench scaler_2x_unwind", |b| b.iter(|| {
        let scaled = scale_2x_unwind(&ARR, 16, 16);
        let scaled = scale_2x_unwind(&scaled, 32, 32);
        let scaled = scale_2x_unwind(&scaled, 64, 64);
        let scaled = scale_2x_unwind(&scaled, 128, 128);
        let scaled = scale_2x_unwind(&scaled, 256, 256);
        let scaled = scale_2x_unwind(&scaled, 512, 512);
        let scaled = scale_2x_unwind(&scaled, 1024, 1024);
        let scaled = scale_2x_unwind(&scaled, 2048, 2048);
        black_box(scaled)
    }));

    c.bench_function("bench scaler_2x_unsafy", |b| b.iter(|| {
        let scaled = scale_2x_unsafy(&ARR, 16, 16);
        let scaled = scale_2x_unsafy(&scaled, 32, 32);
        let scaled = scale_2x_unsafy(&scaled, 64, 64);
        let scaled = scale_2x_unsafy(&scaled, 128, 128);
        let scaled = scale_2x_unsafy(&scaled, 256, 256);
        let scaled = scale_2x_unsafy(&scaled, 512, 512);
        let scaled = scale_2x_unsafy(&scaled, 1024, 1024);
        let scaled = scale_2x_unsafy(&scaled, 2048, 2048);
        black_box(scaled)
    }));

    c.bench_function("bench scaler_2x_unsafy_crazy", |b| b.iter(|| {
        let scaled = scale_2x_unsafy_crazy(&ARR, 16, 16);
        let scaled = scale_2x_unsafy_crazy(&scaled, 32, 32);
        let scaled = scale_2x_unsafy_crazy(&scaled, 64, 64);
        let scaled = scale_2x_unsafy_crazy(&scaled, 128, 128);
        let scaled = scale_2x_unsafy_crazy(&scaled, 256, 256);
        let scaled = scale_2x_unsafy_crazy(&scaled, 512, 512);
        let scaled = scale_2x_unsafy_crazy(&scaled, 1024, 1024);
        let scaled = scale_2x_unsafy_crazy(&scaled, 2048, 2048);
        black_box(scaled)
    }));

    c.bench_function("bench scaler_2x_unsafy_simd", |b| b.iter(|| {
        let scaled = scale_2x_unsafy_simd(&ARR, 16, 16);
        let scaled = scale_2x_unsafy_simd(&scaled, 32, 32);
        let scaled = scale_2x_unsafy_simd(&scaled, 64, 64);
        let scaled = scale_2x_unsafy_simd(&scaled, 128, 128);
        let scaled = scale_2x_unsafy_simd(&scaled, 256, 256);
        let scaled = scale_2x_unsafy_simd(&scaled, 512, 512);
        let scaled = scale_2x_unsafy_simd(&scaled, 1024, 1024);
        let scaled = scale_2x_unsafy_simd(&scaled, 2048, 2048);
        black_box(scaled)
    }));
 }

 criterion_group!(scalers, bench_scalers);
 criterion_main!(scalers);
	use std::ptr;
	use criterion::{black_box, criterion_group, criterion_main, Criterion};

	use std::arch::x86_64::*;

	/// Upscales a row-major `width` x `height` image by a factor of two using
	/// SSE2 unpack instructions, four source pixels at a time.
	///
	/// Every source pixel becomes a 2x2 block in the returned buffer, which has
	/// `width * height * 4` entries.
	///
	/// # Panics
	///
	/// Panics if `width` is not a multiple of 4 or if `pixels` holds fewer than
	/// `width * height` entries.
	fn scale_2x_unsafy_simd(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
	    assert_eq!(width % 4, 0);
	    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
	    let width = width as usize;
	    let height = height as usize;
	    let src_len = width * height;
	    assert!(
	        pixels.len() >= src_len,
	        "pixel buffer is smaller than width * height"
	    );
	    let dbl_width = 2 * width;
	    let quads_per_row = width / 4;

	    let mut output = vec![0u32; src_len * 4];

	    let mut pp = pixels.as_ptr();
	    let mut pp_out = output.as_mut_ptr();

	    // SAFETY: the loops read exactly `src_len` source pixels (checked against
	    // `pixels.len()` above) and write exactly `4 * src_len` output pixels, the
	    // length of `output`. The load/store intrinsics used are the unaligned
	    // variants, and the row copy works on `u32` pointers, so nothing stricter
	    // than `u32` alignment is required. The copied rows are adjacent and
	    // therefore never overlap.
	    unsafe {
	        for _ in 0..height {
	            for _ in 0..quads_per_row {
	                let quad = _mm_loadu_si128(pp as *const __m128i);
	                // [a, b, c, d] -> [a, a, b, b]
	                _mm_storeu_si128(pp_out as *mut __m128i, _mm_unpacklo_epi32(quad, quad));
	                // [a, b, c, d] -> [c, c, d, d]
	                _mm_storeu_si128(
	                    pp_out.add(4) as *mut __m128i,
	                    _mm_unpackhi_epi32(quad, quad),
	                );
	                pp = pp.add(4);
	                pp_out = pp_out.add(8);
	            }
	            // Duplicate the freshly written row into the row below it.
	            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
	            pp_out = pp_out.add(dbl_width);
	        }
	    }
	    output
	}

	/// Upscales a row-major `width` x `height` image by a factor of two using
	/// raw pointer writes.
	///
	/// Every source pixel becomes a 2x2 block in the returned buffer, which has
	/// `width * height * 4` entries.
	///
	/// # Panics
	///
	/// Panics if `pixels` holds fewer than `width * height` entries.
	fn scale_2x_unsafy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
	    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
	    let width = width as usize;
	    let height = height as usize;
	    let src_len = width * height;
	    assert!(
	        pixels.len() >= src_len,
	        "pixel buffer is smaller than width * height"
	    );
	    let dbl_width = width * 2;

	    let mut output = vec![0u32; src_len * 4];

	    let mut pp = pixels.as_ptr();
	    let mut pp_out = output.as_mut_ptr();

	    // SAFETY: the loops read exactly `src_len` source pixels (checked against
	    // `pixels.len()` above) and write exactly `4 * src_len` output pixels, the
	    // length of `output`. The copied rows are adjacent and never overlap.
	    unsafe {
	        for _ in 0..height {
	            for _ in 0..width {
	                // The pointers must be dereferenced: the pixel value is
	                // copied, the pointers themselves are never reassigned.
	                let pixel = *pp;
	                *pp_out = pixel;
	                *pp_out.add(1) = pixel;
	                pp_out = pp_out.add(2);
	                pp = pp.add(1);
	            }
	            // Duplicate the freshly written row into the row below it.
	            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
	            pp_out = pp_out.add(dbl_width);
	        }
	    }
	    output
	}

	/// Upscales a row-major `width` x `height` image by a factor of two, writing
	/// each horizontally doubled pixel as a single 64-bit store.
	///
	/// Every source pixel becomes a 2x2 block in the returned buffer, which has
	/// `width * height * 4` entries.
	///
	/// # Panics
	///
	/// Panics if `pixels` holds fewer than `width * height` entries.
	fn scale_2x_unsafy_crazy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
	    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
	    let width = width as usize;
	    let height = height as usize;
	    let src_len = width * height;
	    assert!(
	        pixels.len() >= src_len,
	        "pixel buffer is smaller than width * height"
	    );
	    let dbl_width = width * 2;

	    let mut output = vec![0u32; src_len * 4];

	    let mut pp = pixels.as_ptr();
	    let mut pp_out = output.as_mut_ptr();

	    // SAFETY: the loops read exactly `src_len` source pixels (checked against
	    // `pixels.len()` above) and write exactly `4 * src_len` output pixels, the
	    // length of `output`. A `Vec<u32>` is only guaranteed to be 4-byte aligned,
	    // so the 64-bit store uses `write_unaligned` and the row copy works on
	    // `u32` pointers. The copied rows are adjacent and never overlap.
	    unsafe {
	        for _ in 0..height {
	            for _ in 0..width {
	                // Multiplying by 2^32 + 1 places the pixel in both 32-bit halves.
	                let doubled = u64::from(*pp) * 0x1_0000_0001;
	                (pp_out as *mut u64).write_unaligned(doubled);
	                pp_out = pp_out.add(2);
	                pp = pp.add(1);
	            }
	            // Duplicate the freshly written row into the row below it.
	            ptr::copy_nonoverlapping(pp_out.sub(dbl_width), pp_out, dbl_width);
	            pp_out = pp_out.add(dbl_width);
	        }
	    }
	    output
	}

	/// Upscales a row-major `width` x `height` image by a factor of two using
	/// only safe code.
	///
	/// Every source pixel becomes a 2x2 block in the returned buffer, which has
	/// `width * height * 4` entries.
	///
	/// # Panics
	///
	/// Panics if `pixels` holds fewer than `width * height` entries.
	fn scale_2x(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
	    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
	    let width = width as usize;
	    let height = height as usize;
	    let dbl_width = width * 2;
	    assert!(
	        pixels.len() >= width * height,
	        "pixel buffer is smaller than width * height"
	    );

	    let mut output = vec![0u32; dbl_width * height * 2];
	    if width == 0 {
	        // Nothing to write, and `chunks_exact(0)` would panic.
	        return output;
	    }

	    // Each source row maps onto two consecutive output rows.
	    let src_rows = pixels.chunks_exact(width);
	    let dst_row_pairs = output.chunks_exact_mut(dbl_width * 2);
	    for (src_row, dst_rows) in src_rows.zip(dst_row_pairs) {
	        let (first_row, second_row) = dst_rows.split_at_mut(dbl_width);
	        for (&pixel, pair) in src_row.iter().zip(first_row.chunks_exact_mut(2)) {
	            pair[0] = pixel;
	            pair[1] = pixel;
	        }
	        second_row.copy_from_slice(first_row);
	    }
	    output
	}

	/// Upscales a row-major `width` x `height` image by a factor of two, with the
	/// horizontal doubling manually unrolled to eight pixels per step.
	///
	/// Widths that are not a multiple of eight are supported: the leftover pixels
	/// of each row are doubled one at a time. Every source pixel becomes a 2x2
	/// block in the returned buffer, which has `width * height * 4` entries.
	///
	/// # Panics
	///
	/// Panics if `pixels` holds fewer than `width * height` entries.
	fn scale_2x_unwind(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
	    // Number of source pixels handled by one unrolled step.
	    const BLOCK: usize = 8;

	    // Do the size arithmetic in `usize`; `width * height * 4` can overflow `u32`.
	    let width = width as usize;
	    let height = height as usize;
	    let dbl_width = width * 2;
	    assert!(
	        pixels.len() >= width * height,
	        "pixel buffer is smaller than width * height"
	    );

	    let mut output = vec![0u32; dbl_width * height * 2];
	    if width == 0 {
	        // Nothing to write, and `chunks_exact(0)` would panic.
	        return output;
	    }

	    // Part of each row that is covered by whole unrolled blocks.
	    let unrolled = width - width % BLOCK;

	    let src_rows = pixels.chunks_exact(width);
	    let dst_row_pairs = output.chunks_exact_mut(dbl_width * 2);
	    for (src_row, dst_rows) in src_rows.zip(dst_row_pairs) {
	        let (first_row, second_row) = dst_rows.split_at_mut(dbl_width);
	        let (src_blocks, src_tail) = src_row.split_at(unrolled);
	        let (dst_blocks, dst_tail) = first_row.split_at_mut(unrolled * 2);

	        let blocks = src_blocks
	            .chunks_exact(BLOCK)
	            .zip(dst_blocks.chunks_exact_mut(BLOCK * 2));
	        for (src, dst) in blocks {
	            dst.copy_from_slice(&[
	                src[0], src[0], src[1], src[1], src[2], src[2], src[3], src[3],
	                src[4], src[4], src[5], src[5], src[6], src[6], src[7], src[7],
	            ]);
	        }
	        // Remainder of the row when `width` is not a multiple of `BLOCK`.
	        for (&pixel, pair) in src_tail.iter().zip(dst_tail.chunks_exact_mut(2)) {
	            pair[0] = pixel;
	            pair[1] = pixel;
	        }
	        second_row.copy_from_slice(first_row);
	    }
	    output
	}

	/// 16x16 test image: the values 1 through 8 repeated across all 256 entries.
	const ARR: [u32; 256] = {
	    let mut table = [0u32; 256];
	    let mut idx = 0;
	    while idx < table.len() {
	        table[idx] = (idx % 8) as u32 + 1;
	        idx += 1;
	    }
	    table
	};

	fn bench_scalers(c: &mut Criterion) {
	c.bench_function("bench scaler_2x", \|b\| b.iter(\|\| {
	let scaled = scale_2x(&ARR, 16, 16);
	let scaled = scale_2x(&scaled, 32, 32);
	let scaled = scale_2x(&scaled, 64, 64);
	let scaled = scale_2x(&scaled, 128, 128);
	let scaled = scale_2x(&scaled, 256, 256);
	let scaled = scale_2x(&scaled, 512, 512);
	let scaled = scale_2x(&scaled, 1024, 1024);
	let scaled = scale_2x(&scaled, 2048, 2048);
	black_box(scaled)
	}));

	c.bench_function("bench scaler_2x_unwind", \|b\| b.iter(\|\| {
	let scaled = scale_2x_unwind(&ARR, 16, 16);
	let scaled = scale_2x_unwind(&scaled, 32, 32);
	let scaled = scale_2x_unwind(&scaled, 64, 64);
	let scaled = scale_2x_unwind(&scaled, 128, 128);
	let scaled = scale_2x_unwind(&scaled, 256, 256);
	let scaled = scale_2x_unwind(&scaled, 512, 512);
	let scaled = scale_2x_unwind(&scaled, 1024, 1024);
	let scaled = scale_2x_unwind(&scaled, 2048, 2048);
	black_box(scaled)
	}));

	c.bench_function("bench scaler_2x_unsafy", \|b\| b.iter(\|\| {
	let scaled = scale_2x_unsafy(&ARR, 16, 16);
	let scaled = scale_2x_unsafy(&scaled, 32, 32);
	let scaled = scale_2x_unsafy(&scaled, 64, 64);
	let scaled = scale_2x_unsafy(&scaled, 128, 128);
	let scaled = scale_2x_unsafy(&scaled, 256, 256);
	let scaled = scale_2x_unsafy(&scaled, 512, 512);
	let scaled = scale_2x_unsafy(&scaled, 1024, 1024);
	let scaled = scale_2x_unsafy(&scaled, 2048, 2048);
	black_box(scaled)
	}));

	c.bench_function("bench scaler_2x_unsafy_crazy", \|b\| b.iter(\|\| {
	let scaled = scale_2x_unsafy_crazy(&ARR, 16, 16);
	let scaled = scale_2x_unsafy_crazy(&scaled, 32, 32);
	let scaled = scale_2x_unsafy_crazy(&scaled, 64, 64);
	let scaled = scale_2x_unsafy_crazy(&scaled, 128, 128);
	let scaled = scale_2x_unsafy_crazy(&scaled, 256, 256);
	let scaled = scale_2x_unsafy_crazy(&scaled, 512, 512);
	let scaled = scale_2x_unsafy_crazy(&scaled, 1024, 1024);
	let scaled = scale_2x_unsafy_crazy(&scaled, 2048, 2048);
	black_box(scaled)
	}));

	c.bench_function("bench scaler_2x_unsafy_simd", \|b\| b.iter(\|\| {
	let scaled = scale_2x_unsafy_simd(&ARR, 16, 16);
	let scaled = scale_2x_unsafy_simd(&scaled, 32, 32);
	let scaled = scale_2x_unsafy_simd(&scaled, 64, 64);
	let scaled = scale_2x_unsafy_simd(&scaled, 128, 128);
	let scaled = scale_2x_unsafy_simd(&scaled, 256, 256);
	let scaled = scale_2x_unsafy_simd(&scaled, 512, 512);
	let scaled = scale_2x_unsafy_simd(&scaled, 1024, 1024);
	let scaled = scale_2x_unsafy_simd(&scaled, 2048, 2048);
	black_box(scaled)
	}));
	}

	criterion_group!(scalers, bench_scalers);
	criterion_main!(scalers);
No results found