Skip to content

Instantly share code, notes, and snippets.

@fp64
Created December 18, 2025 04:17
Show Gist options
  • Select an option

  • Save fp64/426870696a8493d9c8139070b2ec716d to your computer and use it in GitHub Desktop.

Select an option

Save fp64/426870696a8493d9c8139070b2ec716d to your computer and use it in GitHub Desktop.
// Public Domain under http://unlicense.org, see link for details.
// Generator of test cases for PSP vdot instruction.
// Version 1.0.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
// NOTE: we use bitpatterns (as u32), not floats, throughout.
// LSB of u32 is LSB of float's significand.
typedef uint32_t u32;
//==============================================================================
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// ! The interface to the dot product function. !
// ! Replace this with an actual implementation !
// ! invoking PSP vdot instruction. !
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
static u32 vdot(const u32 *x,const u32 *y)
{
// Stub.
(void)x;
(void)y;
return 0;
}
/*
// Example 1. Accessing x86 dpps dot product instruction, GCC inline asm.
static u32 vdot(const u32 *x,const u32 *y)
{
// NOTE: this is over-cautious.
// You may not need __volatile__,
// unless you plan to compute the
// dot product twice and expect
// different results (e.g. you
// change rounding mode in between).
// Also "memory" is likely only
// needed if you don't specify
// the size of inputs via casts.
// See https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Clobbers-and-Scratch-Registers
// for details.
u32 z;
__asm__ __volatile__(
"movups %[x],%%xmm0\n\t"
"movups %[y],%%xmm1\n\t"
"dpps $241,%%xmm1,%%xmm0\n\t"
"movd %%xmm0,%[z]\n\t"
:[z]"=r"(z)
:[x]"m"(*(const u32(*)[4])x)
,[y]"m"(*(const u32(*)[4])y)
:"xmm0","xmm1","memory"
);
return z;
}
*/
/*
// Example 2. Accessing x86 dpps dot product instruction, intrinsics.
static u32 vdot(const u32 *x,const u32 *y)
{
float z=_mm_cvtss_f32(_mm_dp_ps(
_mm_loadu_ps((const float *)x),
_mm_loadu_ps((const float *)y),241));
u32 ret;
memcpy(&ret,&z,sizeof(ret));
return ret;
}
*/
/*
// Example 3. Scalar dot product.
static u32 vdot(const u32 *x,const u32 *y)
{
float X[4],Y[4],ret;
u32 bits;
memcpy(X,x,sizeof(X));
memcpy(Y,y,sizeof(Y));
ret=(X[0]*Y[0]+X[1]*Y[1])+(X[2]*Y[2]+X[3]*Y[3]); // Matches summation order of DPPS.
memcpy(&bits,&ret,sizeof(bits));
return bits;
}
*/
//==============================================================================
// Emitting test cases.
static const char *filename="vdot.bin";
static FILE *file=0;
static bool include_input=true,include_output=true;
static void emit(const u32 *x,const u32 *y)
{
u32 z=vdot(x,y);
if(include_input) fwrite(x,1,4*sizeof(u32),file);
if(include_input) fwrite(y,1,4*sizeof(u32),file);
if(include_output) fwrite(&z,1,sizeof(u32),file);
}
// Emit all 24 permutations of the input order.
// The order matches for x and y, there are no
// x<->y swaps.
// The computed result of the dot product may
// or may not depend on it. E.g. x86 dpps
// instruction is specified to be evaluated as
// (x[0]*y[0]+x[1]*y[1])+(x[2]*y[2]+x[3]*y[3])
static void emit_all_permutations(const u32 *x,const u32 *y)
{
int P[24]={228,180,216,120,156,108,225,177,201, 57,141, 45,210,114,198, 54, 78, 30,147, 99,135, 39, 75, 27};
for(int i=0;i<24;++i)
{
u32 X[4],Y[4];
for(int j=0;j<4;++j) X[j]=x[(P[i]>>(2*j))&3];
for(int j=0;j<4;++j) Y[j]=y[(P[i]>>(2*j))&3];
emit(X,Y);
}
}
//==============================================================================
// Embedding a message (comment) in the output file.
static void message(const char *text)
{
char buf[7*sizeof(u32)]={0};
u32 marker=0xFFFFFFFFu;
snprintf(buf,sizeof(buf),text);
if(include_input) fwrite(&marker,1,sizeof(marker),file);
if(include_input) fwrite(buf,1,sizeof(buf),file);
if(include_output) fwrite(&marker,1,sizeof(marker),file);
}
//==============================================================================
// Floating-point values.
static const u32 S=u32(1)<<31,E=u32(255)<<23,M=(u32(1)<<23)-1;
static const u32 D=u32(1)<<23,ONE=127*D;
//==============================================================================
// Random number generation.
// https://gist.github.com/fp64/126289051f0c9e1b70bcba7c3e07a82f
static inline uint32_t u32noise2(uint64_t n)
{
n^=~n>>32; n*=0xAF251AF3B0F025B5ull;
n^=~n>>32; n=(n|1ull)^(n*n);
return (uint32_t)(n^(n>>32));
}
static uint64_t rng_seed=0;
static inline u32 rnd(void)
{
return u32noise2(rng_seed++);
}
static inline u32 rnd(u32 n)
{
return rnd()%n;
}
static inline u32 rnd(u32 l,u32 h)
{
return l+rnd(h-l);
}
static inline u32 rnd12(void)
{
return ONE+(rnd()&M);
}
//==============================================================================
// Main program.
int main(int argc,char **argv)
{
for(int i=1;i<argc;++i)
{
if(!strcmp(argv[i],"--no-input")) include_input =false;
if(!strcmp(argv[i],"--no-output")) include_output=false;
if(!strncmp(argv[i],"--dump=",7)) filename=argv[i]+7;
}
if(!(file=fopen(filename,"wb"))) return 1;
//==============================================================================
message("PSP vdot testcase generator.");
message("Version 1.0.");
if(true)
{
message("[1;2)");
for(int k=0;k<1024;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) t[i]=rnd12();
emit(t+0,t+4);
}
}
if(true)
{
message("[1;2), perm");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) t[i]=rnd12();
emit_all_permutations(t+0,t+4);
}
}
if(true)
{
message("+- [1;2)");
for(int k=0;k<1024;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S;}
emit(t+0,t+4);
}
}
if(true)
{
message("+- [1;2), perm");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S;}
emit_all_permutations(t+0,t+4);
}
}
if(true)
{
message("+- [1/256;256)");
for(int k=0;k<4096;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S; t[i]+=D*rnd(-u32(8),+u32(8));}
emit(t+0,t+4);
}
}
if(true)
{
message("+- [1/256;256), perm");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) {t[i]=rnd12(); t[i]|=rnd()&S; t[i]+=D*rnd(-u32(8),+u32(8));}
emit_all_permutations(t+0,t+4);
}
}
if(true)
{
message("0*0");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) t[i]=((k>>i)&1?S:0);
emit(t+0,t+4);
}
}
if(true)
{
message("1*0");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) t[i]=((k>>i)&1?S:0)|(i<4?ONE:0);
emit(t+0,t+4);
}
}
if(true)
{
message("Random");
for(int k=0;k<4096;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) t[i]=rnd();
emit(t+0,t+4);
}
}
if(true)
{
message("Random, perm");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
for(int i=0;i<8;++i) t[i]=rnd();
emit_all_permutations(t+0,t+4);
}
}
if(true)
{
message("1+x");
for(int k=0;k<256;++k)
{
u32 t[8]={ONE,ONE,0,0,ONE,0,0,0};
t[5]=rnd12()+2*D;
for(int p=2;p>-32;--p)
{
emit(t+0,t+4);
t[5]-=D;
}
}
}
if(true)
{
message("1-1+x");
for(int k=0;k<256;++k)
{
u32 t[8]={ONE,ONE,ONE,0,ONE,S|ONE,0,0};
t[6]=rnd12()+2*D;
for(int p=2;p>-32;--p)
{
emit(t+0,t+4);
t[6]-=D;
}
}
}
if(true)
{
message("1+x-1+y");
for(int k=0;k<256;++k)
{
u32 t[8]={ONE,ONE,ONE,ONE,ONE,0,S|ONE,0};
t[5]=rnd12()+2*D;
t[7]=rnd12()+2*D;
t[5]|=(S&rnd());
t[7]|=(S&rnd());
for(int p=2;p>-32;--p)
{
emit(t+0,t+4);
t[5]-=D;
t[7]-=D;
}
}
}
if(true)
{
message("1-(1-x)*(1+x)");
for(int k=0;k<256;++k)
{
u32 t[8]={ONE,0,0,0,ONE,0,0,0};
for(int p=22;p>=1;--p)
{
u32 x=rnd(u32(1)<<p);
t[1]=ONE+x;
t[5]=S|(ONE-2*x);
emit(t+0,t+4);
}
}
}
if(true)
{
message("1-(1+x)*(1+y)");
for(int k=0;k<256;++k)
{
u32 t[8]={ONE,0,0,0,ONE,0,0,0};
for(int p=22;p>=1;--p)
{
t[1]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p));
t[5]=S|(ONE+rnd(-(u32(1)<<p),+(u32(1)<<p)));
emit(t+0,t+4);
}
}
}
if(true)
{
message("A*B-C*D");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
for(int p=20;p>=1;--p)
{
t[0]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p));
t[1]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p));
t[4]= ONE+rnd(-(u32(1)<<p),+(u32(1)<<p));
t[5]=S|(ONE+rnd(-(u32(1)<<p),+(u32(1)<<p)));
emit(t+0,t+4);
}
}
}
if(true)
{
message("2^n*(A*B-C*D)");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
u32 c=4096;
u32 xA=rnd(c);
u32 xB=rnd(c);
u32 xC=rnd(c);
u32 xD=rnd(c);
for(u32 p=0;p<256;++p)
{
u32 e=p*D;
t[0]= e+xA;
t[1]= e+xB;
t[4]= e+xC;
t[5]=S|(e+xD);
emit(t+0,t+4);
}
}
}
if(true)
{
message("A*B-C*D, low 0s");
for(int k=0;k<256;++k)
{
u32 t[8]={0};
t[0]= rnd12();
t[1]= rnd12();
t[4]= rnd12();
t[5]=S|(rnd12());
for(int p=1;p<22;++p)
{
t[0]&=(u32(-1)<<p);
t[1]&=(u32(-1)<<p);
t[4]&=(u32(-1)<<p);
t[5]&=(u32(-1)<<p);
emit(t+0,t+4);
}
}
}
if(true)
{
message("1+k*2^n");
for(u32 k=0,p=8;k<(u32(1)<<p);++k)
{
u32 t[8]={ONE,ONE,0,0,ONE,0,0,0};
t[5]=ONE+(k<<(23-p));
for(int n=0;n>-32;--n)
{
emit(t+0,t+4);
t[5]-=D;
}
}
}
if(true)
{
message("0/1/inf/nan");
u32 s[4]={0,ONE,E,E+1};
for(u32 k=0;k<256;++k)
{
u32 t[8]={0};
for(int i=0;i<4;++i) t[(i&1)+4*(i>>1)]=s[(k>>(2*i))&3];
emit(t+0,t+4);
}
}
//==============================================================================
fclose(file);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment