Created
December 18, 2025 04:17
-
-
Save fp64/426870696a8493d9c8139070b2ec716d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Public Domain under http://unlicense.org, see link for details. | |
| // Generator of test cases for PSP vdot instruction. | |
| // Version 1.0. | |
| #include <stddef.h> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
| #include <string.h> | |
| // NOTE: we use bitpatterns (as u32), not floats, throughout. | |
| // LSB of u32 is LSB of float's significand. | |
| typedef uint32_t u32; | |
| //============================================================================== | |
| // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
| // ! The interface to the dot product function. ! | |
| // ! Replace this with an actual implementation ! | |
| // ! invoking PSP vdot instruction. ! | |
| // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
| static u32 vdot(const u32 *x,const u32 *y) | |
| { | |
| // Stub. | |
| (void)x; | |
| (void)y; | |
| return 0; | |
| } | |
| /* | |
| // Example 1. Accessing x86 dpps dot product instruction, GCC inline asm. | |
| static u32 vdot(const u32 *x,const u32 *y) | |
| { | |
| // NOTE: this is over-cautious. | |
| // You may not need __volatile__, | |
| // unless you plan to compute the | |
| // dot product twice and expect | |
| // different results (e.g. you | |
| // change rounding mode in between). | |
| // Also "memory" is likely only | |
| // needed if you don't specify | |
| // the size of inputs via casts. | |
| // See https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Clobbers-and-Scratch-Registers | |
| // for details. | |
| u32 z; | |
| __asm__ __volatile__( | |
| "movups %[x],%%xmm0\n\t" | |
| "movups %[y],%%xmm1\n\t" | |
| "dpps $241,%%xmm1,%%xmm0\n\t" | |
| "movd %%xmm0,%[z]\n\t" | |
| :[z]"=r"(z) | |
| :[x]"m"(*(const u32(*)[4])x) | |
| ,[y]"m"(*(const u32(*)[4])y) | |
| :"xmm0","xmm1","memory" | |
| ); | |
| return z; | |
| } | |
| */ | |
| /* | |
| // Example 2. Accessing x86 dpps dot product instruction, intrinsics. | |
| static u32 vdot(const u32 *x,const u32 *y) | |
| { | |
| float z=_mm_cvtss_f32(_mm_dp_ps( | |
| _mm_loadu_ps((const float *)x), | |
| _mm_loadu_ps((const float *)y),241)); | |
| u32 ret; | |
| memcpy(&ret,&z,sizeof(ret)); | |
| return ret; | |
| } | |
| */ | |
| /* | |
| // Example 3. Scalar dot product. | |
| static u32 vdot(const u32 *x,const u32 *y) | |
| { | |
| float X[4],Y[4],ret; | |
| u32 bits; | |
| memcpy(X,x,sizeof(X)); | |
| memcpy(Y,y,sizeof(Y)); | |
| ret=(X[0]*Y[0]+X[1]*Y[1])+(X[2]*Y[2]+X[3]*Y[3]); // Matches summation order of DPPS. | |
| memcpy(&bits,&ret,sizeof(bits)); | |
| return bits; | |
| } | |
| */ | |
| //============================================================================== | |
| // Emitting test cases. | |
| static const char *filename="vdot.bin"; | |
| static FILE *file=0; | |
| static bool include_input=true,include_output=true; | |
| static void emit(const u32 *x,const u32 *y) | |
| { | |
| u32 z=vdot(x,y); | |
| if(include_input) fwrite(x,1,4*sizeof(u32),file); | |
| if(include_input) fwrite(y,1,4*sizeof(u32),file); | |
| if(include_output) fwrite(&z,1,sizeof(u32),file); | |
| } | |
| // Emit all 24 permutations of the input order. | |
| // The order matches for x and y, there are no | |
| // x<->y swaps. | |
| // The computed result of the dot product may | |
| // or may not depend on it. E.g. x86 dpps | |
| // instruction is specified to be evaluated as | |
| // (x[0]*y[0]+x[1]*y[1])+(x[2]*y[2]+x[3]*y[3]) | |
| static void emit_all_permutations(const u32 *x,const u32 *y) | |
| { | |
| int P[24]={228,180,216,120,156,108,225,177,201, 57,141, 45,210,114,198, 54, 78, 30,147, 99,135, 39, 75, 27}; | |
| for(int i=0;i<24;++i) | |
| { | |
| u32 X[4],Y[4]; | |
| for(int j=0;j<4;++j) X[j]=x[(P[i]>>(2*j))&3]; | |
| for(int j=0;j<4;++j) Y[j]=y[(P[i]>>(2*j))&3]; | |
| emit(X,Y); | |
| } | |
| } | |
| //============================================================================== | |
| // Embedding a message (comment) in the output file. | |
| static void message(const char *text) | |
| { | |
| char buf[7*sizeof(u32)]={0}; | |
| u32 marker=0xFFFFFFFFu; | |
| snprintf(buf,sizeof(buf),text); | |
| if(include_input) fwrite(&marker,1,sizeof(marker),file); | |
| if(include_input) fwrite(buf,1,sizeof(buf),file); | |
| if(include_output) fwrite(&marker,1,sizeof(marker),file); | |
| } | |
| //============================================================================== | |
| // Floating-point values. | |
| static const u32 S=u32(1)<<31,E=u32(255)<<23,M=(u32(1)<<23)-1; | |
| static const u32 D=u32(1)<<23,ONE=127*D; | |
| //============================================================================== | |
| // Random number generation. | |
| // https://gist.github.com/fp64/126289051f0c9e1b70bcba7c3e07a82f | |
| static inline uint32_t u32noise2(uint64_t n) | |
| { | |
| n^=~n>>32; n*=0xAF251AF3B0F025B5ull; | |
| n^=~n>>32; n=(n|1ull)^(n*n); | |
| return (uint32_t)(n^(n>>32)); | |
| } | |
| static uint64_t rng_seed=0; | |
| static inline u32 rnd(void) | |
| { | |
| return u32noise2(rng_seed++); | |
| } | |
| static inline u32 rnd(u32 n) | |
| { | |
| return rnd()%n; | |
| } | |
| static inline u32 rnd(u32 l,u32 h) | |
| { | |
| return l+rnd(h-l); | |
| } | |
| static inline u32 rnd12(void) | |
| { | |
| return ONE+(rnd()&M); | |
| } | |
| //============================================================================== | |
| // Main program. | |
| int main(int argc,char **argv) | |
| { | |
| for(int i=1;i<argc;++i) | |
| { | |
| if(!strcmp(argv[i],"--no-input")) include_input =false; | |
| if(!strcmp(argv[i],"--no-output")) include_output=false; | |
| if(!strncmp(argv[i],"--dump=",7)) filename=argv[i]+7; | |
| } | |
| if(!(file=fopen(filename,"wb"))) return 1; | |
| //============================================================================== | |
| message("PSP vdot testcase generator."); | |
| message("Version 1.0."); | |
| if(true) | |
| { | |
| message("[1;2)"); | |
| for(int k=0;k<1024;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) t[i]=rnd12(); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("[1;2), perm"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) t[i]=rnd12(); | |
| emit_all_permutations(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("+- [1;2)"); | |
| for(int k=0;k<1024;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S;} | |
| emit(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("+- [1;2), perm"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S;} | |
| emit_all_permutations(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("+- [1/256;256)"); | |
| for(int k=0;k<4096;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S; t[i]+=D*rnd(-u32(8),+u32(8));} | |
| emit(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("+- [1/256;256), perm"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S; t[i]+=D*rnd(-u32(8),+u32(8));} | |
| emit_all_permutations(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("0*0"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) t[i]=((k>>i)&1?S:0); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("1*0"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) t[i]=((k>>i)&1?S:0)|(i<4?ONE:0); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("Random"); | |
| for(int k=0;k<4096;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) t[i]=rnd(); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("Random, perm"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<8;++i) t[i]=rnd(); | |
| emit_all_permutations(t+0,t+4); | |
| } | |
| } | |
| if(true) | |
| { | |
| message("1+x"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={ONE,ONE,0,0,ONE,0,0,0}; | |
| t[5]=rnd12()+2*D; | |
| for(int p=2;p>-32;--p) | |
| { | |
| emit(t+0,t+4); | |
| t[5]-=D; | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("1-1+x"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={ONE,ONE,ONE,0,ONE,S|ONE,0,0}; | |
| t[6]=rnd12()+2*D; | |
| for(int p=2;p>-32;--p) | |
| { | |
| emit(t+0,t+4); | |
| t[6]-=D; | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("1+x-1+y"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={ONE,ONE,ONE,ONE,ONE,0,S|ONE,0}; | |
| t[5]=rnd12()+2*D; | |
| t[7]=rnd12()+2*D; | |
| t[5]|=(S&rnd()); | |
| t[7]|=(S&rnd()); | |
| for(int p=2;p>-32;--p) | |
| { | |
| emit(t+0,t+4); | |
| t[5]-=D; | |
| t[7]-=D; | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("1-(1-x)*(1+x)"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={ONE,0,0,0,ONE,0,0,0}; | |
| for(int p=22;p>=1;--p) | |
| { | |
| u32 x=rnd(u32(1)<<p); | |
| t[1]=ONE+x; | |
| t[5]=S|(ONE-2*x); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("1-(1+x)*(1+y)"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={ONE,0,0,0,ONE,0,0,0}; | |
| for(int p=22;p>=1;--p) | |
| { | |
| t[1]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p)); | |
| t[5]=S|(ONE+rnd(-(u32(1)<<p),+(u32(1)<<p))); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("A*B-C*D"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int p=20;p>=1;--p) | |
| { | |
| t[0]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p)); | |
| t[1]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p)); | |
| t[4]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p)); | |
| t[5]=S|(ONE+rnd(-(u32(1)<<p),+(u32(1)<<p))); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("2^n*(A*B-C*D)"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| u32 c=4096; | |
| u32 xA=rnd(c); | |
| u32 xB=rnd(c); | |
| u32 xC=rnd(c); | |
| u32 xD=rnd(c); | |
| for(u32 p=0;p<256;++p) | |
| { | |
| u32 e=p*D; | |
| t[0]= e+xA; | |
| t[1]= e+xB; | |
| t[4]= e+xC; | |
| t[5]=S|(e+xD); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("A*B-C*D, low 0s"); | |
| for(int k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| t[0]= rnd12(); | |
| t[1]= rnd12(); | |
| t[4]= rnd12(); | |
| t[5]=S|(rnd12()); | |
| for(int p=1;p<22;++p) | |
| { | |
| t[0]&=(u32(-1)<<p); | |
| t[1]&=(u32(-1)<<p); | |
| t[4]&=(u32(-1)<<p); | |
| t[5]&=(u32(-1)<<p); | |
| emit(t+0,t+4); | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("1+k*2^n"); | |
| for(u32 k=0,p=8;k<(u32(1)<<p);++k) | |
| { | |
| u32 t[8]={ONE,ONE,0,0,ONE,0,0,0}; | |
| t[5]=ONE+(k<<(23-p)); | |
| for(int n=0;n>-32;--n) | |
| { | |
| emit(t+0,t+4); | |
| t[5]-=D; | |
| } | |
| } | |
| } | |
| if(true) | |
| { | |
| message("0/1/inf/nan"); | |
| u32 s[4]={0,ONE,E,E+1}; | |
| for(u32 k=0;k<256;++k) | |
| { | |
| u32 t[8]={0}; | |
| for(int i=0;i<4;++i) t[(i&1)+4*(i>>1)]=s[(k>>(2*i))&3]; | |
| emit(t+0,t+4); | |
| } | |
| } | |
| //============================================================================== | |
| fclose(file); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment