Last active
April 6, 2020 11:35
-
-
Save madwareru/fd9c8c07f7856a3e1d0c7c6577502799 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use std::ptr; | |
| use criterion::{black_box, criterion_group, criterion_main, Criterion}; | |
| use std::arch::x86_64::*; | |
/// Upscales an image by a factor of two in both dimensions using SSE2
/// unpack instructions (nearest-neighbour: every pixel becomes a 2x2 block).
///
/// `pixels` is read as `height` rows of `width` pixels each; any elements
/// past `width * height` are ignored. The result holds `2 * height` rows of
/// `2 * width` pixels.
///
/// # Panics
///
/// Panics if `width` is not a multiple of 4, if `pixels` holds fewer than
/// `width * height` elements, or if the output size overflows `usize`.
fn scale_2x_unsafy_simd(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    assert_eq!(width % 4, 0);
    let src_width = width as usize;
    let src_height = height as usize;
    // Do the size arithmetic in `usize` with overflow checks: a wrapped size
    // would allocate too small a buffer for the raw writes below.
    let pixel_count = src_width
        .checked_mul(src_height)
        .expect("width * height overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixels holds {} elements but width * height is {}",
        pixels.len(),
        pixel_count
    );
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("output size overflows usize");
    let mut output = vec![0u32; whole_size];
    if pixel_count == 0 {
        return output;
    }
    // Cannot overflow: it is at most `whole_size` because `height >= 1` here.
    let dbl_width = src_width * 2;
    let mut src = pixels.as_ptr();
    let mut dst = output.as_mut_ptr();
    // SAFETY: `pixels` holds at least `width * height` elements (asserted
    // above) and `width` is a multiple of 4, so the loads read exactly
    // `width * height` in-bounds elements. `output` holds
    // `4 * width * height` elements, and each row iteration writes
    // `2 * width` elements and then copies them into the following
    // `2 * width`, so `dst` never passes the end of the buffer. The SIMD
    // accesses use the unaligned load/store intrinsics, and the row copy is
    // done in `u32` units, so only `u32` alignment is ever required.
    unsafe {
        for _ in 0..src_height {
            for _ in 0..src_width / 4 {
                let quad = _mm_loadu_si128(src as *const __m128i);
                // unpacklo yields p0 p0 p1 p1, unpackhi yields p2 p2 p3 p3.
                _mm_storeu_si128(dst as *mut __m128i, _mm_unpacklo_epi32(quad, quad));
                _mm_storeu_si128(dst.add(4) as *mut __m128i, _mm_unpackhi_epi32(quad, quad));
                dst = dst.add(8);
                src = src.add(4);
            }
            // Duplicate the row that was just written into the row below it.
            ptr::copy_nonoverlapping(dst.sub(dbl_width), dst, dbl_width);
            dst = dst.add(dbl_width);
        }
    }
    output
}
/// Upscales an image by a factor of two in both dimensions using raw
/// pointer writes (nearest-neighbour: every pixel becomes a 2x2 block).
///
/// `pixels` is read as `height` rows of `width` pixels each; any elements
/// past `width * height` are ignored. The result holds `2 * height` rows of
/// `2 * width` pixels.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size overflows `usize`.
fn scale_2x_unsafy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    let src_width = width as usize;
    let src_height = height as usize;
    // Do the size arithmetic in `usize` with overflow checks: a wrapped size
    // would allocate too small a buffer for the raw writes below.
    let pixel_count = src_width
        .checked_mul(src_height)
        .expect("width * height overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixels holds {} elements but width * height is {}",
        pixels.len(),
        pixel_count
    );
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("output size overflows usize");
    let mut output = vec![0u32; whole_size];
    if pixel_count == 0 {
        return output;
    }
    // Cannot overflow: it is at most `whole_size` because `height >= 1` here.
    let dbl_width = src_width * 2;
    let mut src = pixels.as_ptr();
    let mut dst = output.as_mut_ptr();
    // SAFETY: `pixels` holds at least `width * height` elements (asserted
    // above), and exactly that many are read through `src`. `output` holds
    // `4 * width * height` elements; each row iteration writes `2 * width`
    // elements and then copies them into the following `2 * width`, so `dst`
    // never passes the end of the buffer. The copied ranges are adjacent and
    // therefore never overlap.
    unsafe {
        for _ in 0..src_height {
            for _ in 0..src_width {
                let pixel = *src;
                *dst = pixel;
                *dst.add(1) = pixel;
                dst = dst.add(2);
                src = src.add(1);
            }
            // Duplicate the row that was just written into the row below it.
            ptr::copy_nonoverlapping(dst.sub(dbl_width), dst, dbl_width);
            dst = dst.add(dbl_width);
        }
    }
    output
}
/// Upscales an image by a factor of two in both dimensions, writing each
/// horizontally doubled pixel as a single 64-bit store.
///
/// `pixels` is read as `height` rows of `width` pixels each; any elements
/// past `width * height` are ignored. The result holds `2 * height` rows of
/// `2 * width` pixels.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size overflows `usize`.
fn scale_2x_unsafy_crazy(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    let src_width = width as usize;
    let src_height = height as usize;
    // Do the size arithmetic in `usize` with overflow checks: a wrapped size
    // would allocate too small a buffer for the raw writes below.
    let pixel_count = src_width
        .checked_mul(src_height)
        .expect("width * height overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixels holds {} elements but width * height is {}",
        pixels.len(),
        pixel_count
    );
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("output size overflows usize");
    let mut output = vec![0u32; whole_size];
    if pixel_count == 0 {
        return output;
    }
    // Cannot overflow: it is at most `whole_size` because `height >= 1` here.
    let dbl_width = src_width * 2;
    let mut src = pixels.as_ptr();
    let mut dst = output.as_mut_ptr();
    // SAFETY: `pixels` holds at least `width * height` elements (asserted
    // above), and exactly that many are read through `src`. `output` holds
    // `4 * width * height` elements; each row iteration writes `2 * width`
    // elements (as `width` 64-bit stores) and then copies them into the
    // following `2 * width`, so `dst` never passes the end of the buffer.
    // The buffer is only guaranteed to be `u32`-aligned, so the 64-bit
    // stores use `write_unaligned` and the row copy is done in `u32` units.
    unsafe {
        for _ in 0..src_height {
            for _ in 0..src_width {
                // Multiplying by 2^32 + 1 places the pixel in both 32-bit
                // halves, so the result does not depend on endianness.
                let doubled = u64::from(*src) * 0x1_0000_0001;
                (dst as *mut u64).write_unaligned(doubled);
                dst = dst.add(2);
                src = src.add(1);
            }
            // Duplicate the row that was just written into the row below it.
            ptr::copy_nonoverlapping(dst.sub(dbl_width), dst, dbl_width);
            dst = dst.add(dbl_width);
        }
    }
    output
}
/// Upscales an image by a factor of two in both dimensions using only safe
/// code (nearest-neighbour: every pixel becomes a 2x2 block).
///
/// `pixels` is read as `height` rows of `width` pixels each; any elements
/// past `width * height` are ignored. The result holds `2 * height` rows of
/// `2 * width` pixels.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size overflows `usize`.
fn scale_2x(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    let src_width = width as usize;
    let src_height = height as usize;
    // Size arithmetic is done in `usize` with overflow checks; the `u32`
    // products used previously could overflow for large dimensions.
    let pixel_count = src_width
        .checked_mul(src_height)
        .expect("width * height overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixels holds {} elements but width * height is {}",
        pixels.len(),
        pixel_count
    );
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("output size overflows usize");
    let mut output = vec![0u32; whole_size];
    if pixel_count == 0 {
        // Also keeps the zero-sized chunk calls below from panicking.
        return output;
    }
    // Cannot overflow: it is at most `whole_size` because `height >= 1` here.
    let dbl_width = src_width * 2;
    // Each source row maps to a pair of adjacent output rows. The output has
    // exactly `height` such pairs, so surplus input rows are dropped by `zip`.
    let row_pairs = pixels
        .chunks_exact(src_width)
        .zip(output.chunks_exact_mut(dbl_width * 2));
    for (src_row, dst_rows) in row_pairs {
        let (first_row, second_row) = dst_rows.split_at_mut(dbl_width);
        for (pair, &pixel) in first_row.chunks_exact_mut(2).zip(src_row) {
            pair[0] = pixel;
            pair[1] = pixel;
        }
        second_row.copy_from_slice(first_row);
    }
    output
}
/// Upscales an image by a factor of two in both dimensions using safe code
/// with the inner loop manually unrolled to eight source pixels per step.
///
/// `pixels` is read as `height` rows of `width` pixels each; any elements
/// past `width * height` are ignored. The result holds `2 * height` rows of
/// `2 * width` pixels. Widths that are not a multiple of eight are handled
/// by a per-pixel tail loop after the unrolled groups.
///
/// # Panics
///
/// Panics if `pixels` holds fewer than `width * height` elements or if the
/// output size overflows `usize`.
fn scale_2x_unwind(pixels: &[u32], width: u32, height: u32) -> Vec<u32> {
    let src_width = width as usize;
    let src_height = height as usize;
    // Size arithmetic is done in `usize` with overflow checks; the `u32`
    // products used previously could overflow for large dimensions.
    let pixel_count = src_width
        .checked_mul(src_height)
        .expect("width * height overflows usize");
    assert!(
        pixels.len() >= pixel_count,
        "pixels holds {} elements but width * height is {}",
        pixels.len(),
        pixel_count
    );
    let whole_size = pixel_count
        .checked_mul(4)
        .expect("output size overflows usize");
    let mut output = vec![0u32; whole_size];
    if pixel_count == 0 {
        // Also keeps the zero-sized chunk calls below from panicking.
        return output;
    }
    // Cannot overflow: it is at most `whole_size` because `height >= 1` here.
    let dbl_width = src_width * 2;
    // Each source row maps to a pair of adjacent output rows. The output has
    // exactly `height` such pairs, so surplus input rows are dropped by `zip`.
    let row_pairs = pixels
        .chunks_exact(src_width)
        .zip(output.chunks_exact_mut(dbl_width * 2));
    for (src_row, dst_rows) in row_pairs {
        let (first_row, second_row) = dst_rows.split_at_mut(dbl_width);

        // Unrolled part: eight source pixels become sixteen output pixels.
        let mut src_groups = src_row.chunks_exact(8);
        let mut dst_groups = first_row.chunks_exact_mut(16);
        for (s, d) in src_groups.by_ref().zip(dst_groups.by_ref()) {
            d.copy_from_slice(&[
                s[0], s[0], s[1], s[1], s[2], s[2], s[3], s[3],
                s[4], s[4], s[5], s[5], s[6], s[6], s[7], s[7],
            ]);
        }

        // Tail: the `width % 8` pixels that do not fill a whole group.
        let dst_tail = dst_groups.into_remainder();
        for (pair, &pixel) in dst_tail.chunks_exact_mut(2).zip(src_groups.remainder()) {
            pair[0] = pixel;
            pair[1] = pixel;
        }

        second_row.copy_from_slice(first_row);
    }
    output
}
/// Seed image for the benchmarks: 256 pixels holding the repeating
/// sequence 1, 2, ..., 8 (used by the benchmarks as a 16x16 image).
const ARR: [u32; 256] = {
    let mut seed = [0u32; 256];
    let mut index = 0;
    // `for` loops are not available in const evaluation, hence `while`.
    while index < seed.len() {
        seed[index] = (index % 8) as u32 + 1;
        index += 1;
    }
    seed
};
| fn bench_scalers(c: &mut Criterion) { | |
| c.bench_function("bench scaler_2x", |b| b.iter(|| { | |
| let scaled = scale_2x(&ARR, 16, 16); | |
| let scaled = scale_2x(&scaled, 32, 32); | |
| let scaled = scale_2x(&scaled, 64, 64); | |
| let scaled = scale_2x(&scaled, 128, 128); | |
| let scaled = scale_2x(&scaled, 256, 256); | |
| let scaled = scale_2x(&scaled, 512, 512); | |
| let scaled = scale_2x(&scaled, 1024, 1024); | |
| let scaled = scale_2x(&scaled, 2048, 2048); | |
| black_box(scaled) | |
| })); | |
| c.bench_function("bench scaler_2x_unwind", |b| b.iter(|| { | |
| let scaled = scale_2x_unwind(&ARR, 16, 16); | |
| let scaled = scale_2x_unwind(&scaled, 32, 32); | |
| let scaled = scale_2x_unwind(&scaled, 64, 64); | |
| let scaled = scale_2x_unwind(&scaled, 128, 128); | |
| let scaled = scale_2x_unwind(&scaled, 256, 256); | |
| let scaled = scale_2x_unwind(&scaled, 512, 512); | |
| let scaled = scale_2x_unwind(&scaled, 1024, 1024); | |
| let scaled = scale_2x_unwind(&scaled, 2048, 2048); | |
| black_box(scaled) | |
| })); | |
| c.bench_function("bench scaler_2x_unsafy", |b| b.iter(|| { | |
| let scaled = scale_2x_unsafy(&ARR, 16, 16); | |
| let scaled = scale_2x_unsafy(&scaled, 32, 32); | |
| let scaled = scale_2x_unsafy(&scaled, 64, 64); | |
| let scaled = scale_2x_unsafy(&scaled, 128, 128); | |
| let scaled = scale_2x_unsafy(&scaled, 256, 256); | |
| let scaled = scale_2x_unsafy(&scaled, 512, 512); | |
| let scaled = scale_2x_unsafy(&scaled, 1024, 1024); | |
| let scaled = scale_2x_unsafy(&scaled, 2048, 2048); | |
| black_box(scaled) | |
| })); | |
| c.bench_function("bench scaler_2x_unsafy_crazy", |b| b.iter(|| { | |
| let scaled = scale_2x_unsafy_crazy(&ARR, 16, 16); | |
| let scaled = scale_2x_unsafy_crazy(&scaled, 32, 32); | |
| let scaled = scale_2x_unsafy_crazy(&scaled, 64, 64); | |
| let scaled = scale_2x_unsafy_crazy(&scaled, 128, 128); | |
| let scaled = scale_2x_unsafy_crazy(&scaled, 256, 256); | |
| let scaled = scale_2x_unsafy_crazy(&scaled, 512, 512); | |
| let scaled = scale_2x_unsafy_crazy(&scaled, 1024, 1024); | |
| let scaled = scale_2x_unsafy_crazy(&scaled, 2048, 2048); | |
| black_box(scaled) | |
| })); | |
| c.bench_function("bench scaler_2x_unsafy_simd", |b| b.iter(|| { | |
| let scaled = scale_2x_unsafy_simd(&ARR, 16, 16); | |
| let scaled = scale_2x_unsafy_simd(&scaled, 32, 32); | |
| let scaled = scale_2x_unsafy_simd(&scaled, 64, 64); | |
| let scaled = scale_2x_unsafy_simd(&scaled, 128, 128); | |
| let scaled = scale_2x_unsafy_simd(&scaled, 256, 256); | |
| let scaled = scale_2x_unsafy_simd(&scaled, 512, 512); | |
| let scaled = scale_2x_unsafy_simd(&scaled, 1024, 1024); | |
| let scaled = scale_2x_unsafy_simd(&scaled, 2048, 2048); | |
| black_box(scaled) | |
| })); | |
| } | |
| criterion_group!(scalers, bench_scalers); | |
| criterion_main!(scalers); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment