Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active December 31, 2025 15:36
Show Gist options
  • Select an option

  • Save vurtun/189784d4ffc5e24865e2f41de46026a7 to your computer and use it in GitHub Desktop.

Select an option

Save vurtun/189784d4ffc5e24865e2f41de46026a7 to your computer and use it in GitHub Desktop.
3D High-Performance Hard-Realtime Cloth Simulation
/*
* -----------------------------------------------------------------------------
* High-Performance Hard-Realtime Cloth Simulation
* -----------------------------------------------------------------------------
* DESCRIPTION:
* This module implements a production-hardened CPU cloth
* solver designed for AA/AAA open-world game. It utilizes
* NvCloth cooking data (topology) but replaces the runtime with a highly
* optimized, cache-aware, and predictable AVX2 implementation.
*
* Target Performance: < 3.0ms total frame time for 128 active cloths on
* mainstream hardware (Ryzen 3000 / Intel 8th Gen), scaling linearly with
* core count.
*
* PHYSICS MODEL:
* - Framework: XPBD (Extended Position Based Dynamics) with Graph-Colored
* Gauss-Seidel solving.
* - Compliance: Stiffness is timestep-invariant (alpha = 1 / (k * dt^2)).
* - Aerodynamics: Triangle-based Drag & Lift model with occlusion approximation.
* - Coupling: Full Rigid Body coupling including Centrifugal, Euler, and
* Coriolis fictitious forces.
* - Constraints: Distance (with compression/stretch limits), Unilateral Tethers
* (long-range attachment safety), and Sphere Collisions.
*
* ARCHITECTURAL DESIGN DECISIONS:
* 1. HYBRID MEMORY LAYOUT (SoA + AoS):
* - Batch Data (SoA): Rigid body transforms and global forces are stored in
* Structure-of-Arrays to allow processing 8 cloths simultaneously using
* pure AVX2 registers.
* - Solver Data (AoS): Particle state is stored in Per-Cloth Structs
* (~15KB). This fits entirely in L1 Cache, making random access (Gather)
* extremely fast (~4 cycles) effectively neutralizing scatter/gather cost.
*
* 2. HARD REAL-TIME GUARANTEES:
* - Zero Memory Allocation: All memory is statically defined. No malloc/free.
* - Zero Variance: All loops (particles, tethers, colliders) have fixed
* bounds. There are no dynamic branches based on active constraints.
* This ensures a "flat line" in profiling, critical for consistent frame pacing.
*
* 3. SOLVER STRATEGY (Gauss-Seidel):
* - Unlike Jacobi solvers, this implementation applies position corrections
* immediately within the solver loop. This requires Graph Coloring (Phases)
* to be thread-safe, but results in significantly faster convergence
* (tighter cloth) than Jacobi.
*
* USAGE:
* 1. Call `cloth_pre_batch_update(dt)` on the main thread (Very fast).
* 2. Dispatch `cloth_solve_range(..., start, end)` across worker threads
* via the Job System.
*
* CONSTRAINTS:
* - Max Cloths: 128
* - Max Particles per Cloth: 256 (Simulation mesh, skin visual mesh to this)
* - SIMD: Requires AVX2 (Haswell+).
* - Code Style: MISRA-C compliant (restrict, const correctness, no hidden loops).
* -----------------------------------------------------------------------------
*/
#include <immintrin.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <stdalign.h>
#include <stdint.h>
// -----------------------------------------------------------------------------
// Configuration & Constants (Target: < 3ms for 128 Cloths on 8th Gen HW)
// -----------------------------------------------------------------------------
// 128 active cloths. 128 / 8 = 16 AVX2 batches.
#define MAX_CLOTHS 128
// 256 vertices (16x16 grid).
// Struct size: ~15KB (Fits entirely in L1 Cache per cloth).
#define MAX_PARTICLES 256
#define MAX_TRIS 768
#define MAX_CONSTRAINTS 2048
#define MAX_PHASES 16
#define MAX_FABRICS 128
#define MAX_SPHERES 16
#define MAX_ANCHORS 256
#define MAX_TETHERS 256
// -----------------------------------------------------------------------------
// Alignment & Tuning
// -----------------------------------------------------------------------------
#define ALIGNMENT 32
#define EPSILON 0.0001f
#define PARTICLES_PER_BATCH 64
#define CONSTRAINT_BATCH_SIZE 256
#define TRIANGLE_BATCH_SIZE 256
#define SUBSTEPS 4
// Runtime validation macros (Zero overhead in Release)
#ifndef NDEBUG
#define SIM_ASSERT(x) assert(x)
#define SIM_VALIDATE(s, c) validate_state(s, c)
// Forward declaration
struct cloth_sim;
static void validate_state(const struct cloth_sim* sim, int c);
#else
#define SIM_ASSERT(x) ((void)0)
#define SIM_VALIDATE(s, c) ((void)0)
#endif
// -----------------------------------------------------------------------------
// Data Structures
// -----------------------------------------------------------------------------
struct cloth_cfg {
float motion_stiffness;
float phases_stiffness;
float phases_compress_limit;
float phases_stretch_limit;
float drag_coeff;
float lift_coeff;
float wind_density;
float wind_friction;
float tether_stiffness;
float tether_length_scale;
float damping_coeff;
float linear_inertia[3];
float angular_inertia[3];
float centrifugal_inertia[3];
} __attribute__((aligned(32)));
struct cloth_fabric {
alignas(32) int tri_i0[MAX_TRIS];
alignas(32) int tri_i1[MAX_TRIS];
alignas(32) int tri_i2[MAX_TRIS];
alignas(32) int idx0[MAX_CONSTRAINTS];
alignas(32) int idx1[MAX_CONSTRAINTS];
alignas(32) float rest_vals[MAX_CONSTRAINTS];
alignas(32) float stiff_vals[MAX_CONSTRAINTS];
alignas(32) int anchors[MAX_ANCHORS];
alignas(32) float tether_lens[MAX_TETHERS];
alignas(32) int particle_idx[MAX_TETHERS];
alignas(32) int phase_idx[MAX_PHASES];
alignas(32) int sets[MAX_PHASES];
};
struct cloth_batch_data {
alignas(32) float mdl_pos_x[MAX_CLOTHS];
alignas(32) float mdl_pos_y[MAX_CLOTHS];
alignas(32) float mdl_pos_z[MAX_CLOTHS];
alignas(32) float mdl_rot_x[MAX_CLOTHS];
alignas(32) float mdl_rot_y[MAX_CLOTHS];
alignas(32) float mdl_rot_z[MAX_CLOTHS];
alignas(32) float mdl_rot_w[MAX_CLOTHS];
alignas(32) float prev_mdl_pos_x[MAX_CLOTHS];
alignas(32) float prev_mdl_pos_y[MAX_CLOTHS];
alignas(32) float prev_mdl_pos_z[MAX_CLOTHS];
alignas(32) float prev_mdl_rot_x[MAX_CLOTHS];
alignas(32) float prev_mdl_rot_y[MAX_CLOTHS];
alignas(32) float prev_mdl_rot_z[MAX_CLOTHS];
alignas(32) float prev_mdl_rot_w[MAX_CLOTHS];
alignas(32) float local_gravity_x[MAX_CLOTHS];
alignas(32) float local_gravity_y[MAX_CLOTHS];
alignas(32) float local_gravity_z[MAX_CLOTHS];
alignas(32) float local_wind_x[MAX_CLOTHS];
alignas(32) float local_wind_y[MAX_CLOTHS];
alignas(32) float local_wind_z[MAX_CLOTHS];
alignas(32) float lin_vel_x[MAX_CLOTHS];
alignas(32) float lin_vel_y[MAX_CLOTHS];
alignas(32) float lin_vel_z[MAX_CLOTHS];
alignas(32) float ang_vel_x[MAX_CLOTHS];
alignas(32) float ang_vel_y[MAX_CLOTHS];
alignas(32) float ang_vel_z[MAX_CLOTHS];
};
struct cloth_instance_state {
alignas(32) float p_pos_x[MAX_PARTICLES];
alignas(32) float p_pos_y[MAX_PARTICLES];
alignas(32) float p_pos_z[MAX_PARTICLES];
alignas(32) float p_rmass[MAX_PARTICLES];
alignas(32) float p_pos_x_prev[MAX_PARTICLES];
alignas(32) float p_pos_y_prev[MAX_PARTICLES];
alignas(32) float p_pos_z_prev[MAX_PARTICLES];
alignas(32) float p_rest_x[MAX_PARTICLES];
alignas(32) float p_rest_y[MAX_PARTICLES];
alignas(32) float p_rest_z[MAX_PARTICLES];
alignas(32) float p_rest_w[MAX_PARTICLES];
alignas(32) float p_wnd_frc_x[MAX_PARTICLES];
alignas(32) float p_wnd_frc_y[MAX_PARTICLES];
alignas(32) float p_wnd_frc_z[MAX_PARTICLES];
};
struct cloth_sim {
int fab_idx[MAX_CLOTHS];
struct cloth_cfg cfgs[MAX_CLOTHS];
struct cloth_fabric fabs[MAX_FABRICS];
// data layout split for optimal access patterns
struct cloth_batch_data batch_data;
struct cloth_instance_state cloths[MAX_CLOTHS];
// colliders
alignas(32) float sphere_pos_x[MAX_CLOTHS * MAX_SPHERES];
alignas(32) float sphere_pos_y[MAX_CLOTHS * MAX_SPHERES];
alignas(32) float sphere_pos_z[MAX_CLOTHS * MAX_SPHERES];
alignas(32) float sphere_radius[MAX_CLOTHS * MAX_SPHERES];
alignas(32) float global_gravity[4];
alignas(32) float global_wind[4];
} __attribute__((aligned(64)));
_Static_assert(sizeof(struct cloth_instance_state) % 32 == 0, "Instance state misalignment");
// -----------------------------------------------------------------------------
// Helper Intrinsics
// -----------------------------------------------------------------------------
static inline __attribute__((always_inline)) __m256
mm256_rsqrt_nr_ps(__m256 x) {
// Newton-Raphson refinement for rsqrt (Increases precision 10^-4 -> 10^-7)
__m256 nr = _mm256_rsqrt_ps(x);
__m256 muls = _mm256_mul_ps(_mm256_mul_ps(nr, nr), x);
__m256 beta = _mm256_mul_ps(_mm256_sub_ps(_mm256_set1_ps(3.0f), muls), _mm256_set1_ps(0.5f));
return _mm256_mul_ps(nr, beta);
}
// Rotates vector by inverse quaternion (Requires negated q.xyz inputs)
static inline __attribute__((always_inline)) void
avx_rotate_vector_inverse(float vx, float vy, float vz,
__m256 nqx, __m256 nqy, __m256 nqz, __m256 qw,
__m256* out_x, __m256* out_y, __m256* out_z) {
const __m256 g_vx = _mm256_set1_ps(vx);
const __m256 g_vy = _mm256_set1_ps(vy);
const __m256 g_vz = _mm256_set1_ps(vz);
const __m256 two = _mm256_set1_ps(2.0f);
__m256 tx = _mm256_mul_ps(two, _mm256_sub_ps(_mm256_mul_ps(nqy, g_vz), _mm256_mul_ps(nqz, g_vy)));
__m256 ty = _mm256_mul_ps(two, _mm256_sub_ps(_mm256_mul_ps(nqz, g_vx), _mm256_mul_ps(nqx, g_vz)));
__m256 tz = _mm256_mul_ps(two, _mm256_sub_ps(_mm256_mul_ps(nqx, g_vy), _mm256_mul_ps(nqy, g_vx)));
*out_x = _mm256_add_ps(g_vx, _mm256_add_ps(_mm256_mul_ps(qw, tx),
_mm256_sub_ps(_mm256_mul_ps(nqy, tz), _mm256_mul_ps(nqz, ty))));
*out_y = _mm256_add_ps(g_vy, _mm256_add_ps(_mm256_mul_ps(qw, ty),
_mm256_sub_ps(_mm256_mul_ps(nqz, tx), _mm256_mul_ps(nqx, tz))));
*out_z = _mm256_add_ps(g_vz, _mm256_add_ps(_mm256_mul_ps(qw, tz),
_mm256_sub_ps(_mm256_mul_ps(nqx, ty), _mm256_mul_ps(nqy, tx))));
}
// -----------------------------------------------------------------------------
// Validation (Debug)
// -----------------------------------------------------------------------------
#ifndef NDEBUG
static void
validate_state(const struct cloth_sim* sim, int c) {
SIM_ASSERT(sim != NULL);
SIM_ASSERT(0 <= c && c < MAX_CLOTHS);
const struct cloth_instance_state* st = &sim->cloths[c];
for (int i = 0; i < MAX_PARTICLES; ++i) {
SIM_ASSERT(isfinite(st->p_pos_x[i]));
SIM_ASSERT(isfinite(st->p_pos_y[i]));
SIM_ASSERT(isfinite(st->p_pos_z[i]));
}
}
#endif
// -----------------------------------------------------------------------------
// Phase 1: Batch Setup (Rigid Body Math)
// -----------------------------------------------------------------------------
static void
compute_batch_setup(struct cloth_sim* restrict sim, int base_c, float dt) {
struct cloth_batch_data* restrict bd = &sim->batch_data;
const __m256 inv_dt = _mm256_rcp_ps(_mm256_max_ps(_mm256_set1_ps(dt), _mm256_set1_ps(EPSILON)));
const __m256 two = _mm256_set1_ps(2.0f);
const __m256 zero = _mm256_setzero_ps();
__m256 px = _mm256_load_ps(&bd->mdl_pos_x[base_c]);
__m256 py = _mm256_load_ps(&bd->mdl_pos_y[base_c]);
__m256 pz = _mm256_load_ps(&bd->mdl_pos_z[base_c]);
__m256 qx = _mm256_load_ps(&bd->mdl_rot_x[base_c]);
__m256 qy = _mm256_load_ps(&bd->mdl_rot_y[base_c]);
__m256 qz = _mm256_load_ps(&bd->mdl_rot_z[base_c]);
__m256 qw = _mm256_load_ps(&bd->mdl_rot_w[base_c]);
__m256 prev_px = _mm256_load_ps(&bd->prev_mdl_pos_x[base_c]);
__m256 prev_py = _mm256_load_ps(&bd->prev_mdl_pos_y[base_c]);
__m256 prev_pz = _mm256_load_ps(&bd->prev_mdl_pos_z[base_c]);
__m256 prev_qx = _mm256_load_ps(&bd->prev_mdl_rot_x[base_c]);
__m256 prev_qy = _mm256_load_ps(&bd->prev_mdl_rot_y[base_c]);
__m256 prev_qz = _mm256_load_ps(&bd->prev_mdl_rot_z[base_c]);
__m256 prev_qw = _mm256_load_ps(&bd->prev_mdl_rot_w[base_c]);
float lin_in_x[8], lin_in_y[8], lin_in_z[8];
float ang_in_x[8], ang_in_y[8], ang_in_z[8];
for (int k=0; k<8; ++k) {
const struct cloth_cfg* cfg = &sim->cfgs[base_c+k];
lin_in_x[k] = cfg->linear_inertia[0];
lin_in_y[k] = cfg->linear_inertia[1];
lin_in_z[k] = cfg->linear_inertia[2];
ang_in_x[k] = cfg->angular_inertia[0];
ang_in_y[k] = cfg->angular_inertia[1];
ang_in_z[k] = cfg->angular_inertia[2];
}
__m256 lv_x = _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(px, prev_px), inv_dt), _mm256_loadu_ps(lin_in_x));
__m256 lv_y = _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(py, prev_py), inv_dt), _mm256_loadu_ps(lin_in_y));
__m256 lv_z = _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(pz, prev_pz), inv_dt), _mm256_loadu_ps(lin_in_z));
_mm256_store_ps(&bd->lin_vel_x[base_c], lv_x);
_mm256_store_ps(&bd->lin_vel_y[base_c], lv_y);
_mm256_store_ps(&bd->lin_vel_z[base_c], lv_z);
__m256 n_pqx = _mm256_sub_ps(zero, prev_qx);
__m256 n_pqy = _mm256_sub_ps(zero, prev_qy);
__m256 n_pqz = _mm256_sub_ps(zero, prev_qz);
__m256 rx = _mm256_add_ps(_mm256_mul_ps(qw, n_pqx), _mm256_add_ps(_mm256_mul_ps(qx, prev_qw), _mm256_sub_ps(_mm256_mul_ps(qy, n_pqz), _mm256_mul_ps(qz, n_pqy))));
__m256 ry = _mm256_add_ps(_mm256_mul_ps(qw, n_pqy), _mm256_add_ps(_mm256_mul_ps(qy, prev_qw), _mm256_sub_ps(_mm256_mul_ps(qz, n_pqx), _mm256_mul_ps(qx, n_pqz))));
__m256 rz = _mm256_add_ps(_mm256_mul_ps(qw, n_pqz), _mm256_add_ps(_mm256_mul_ps(qz, prev_qw), _mm256_sub_ps(_mm256_mul_ps(qx, n_pqy), _mm256_mul_ps(qy, n_pqx))));
__m256 scale = _mm256_mul_ps(two, inv_dt);
_mm256_store_ps(&bd->ang_vel_x[base_c], _mm256_mul_ps(_mm256_mul_ps(rx, scale), _mm256_loadu_ps(ang_in_x)));
_mm256_store_ps(&bd->ang_vel_y[base_c], _mm256_mul_ps(_mm256_mul_ps(ry, scale), _mm256_loadu_ps(ang_in_y)));
_mm256_store_ps(&bd->ang_vel_z[base_c], _mm256_mul_ps(_mm256_mul_ps(rz, scale), _mm256_loadu_ps(ang_in_z)));
__m256 nqx = _mm256_sub_ps(zero, qx);
__m256 nqy = _mm256_sub_ps(zero, qy);
__m256 nqz = _mm256_sub_ps(zero, qz);
__m256 lgx, lgy, lgz;
avx_rotate_vector_inverse(sim->global_gravity[0], sim->global_gravity[1], sim->global_gravity[2], nqx, nqy, nqz, qw, &lgx, &lgy, &lgz);
_mm256_store_ps(&bd->local_gravity_x[base_c], lgx);
_mm256_store_ps(&bd->local_gravity_y[base_c], lgy);
_mm256_store_ps(&bd->local_gravity_z[base_c], lgz);
__m256 lwx, lwy, lwz;
avx_rotate_vector_inverse(sim->global_wind[0], sim->global_wind[1], sim->global_wind[2], nqx, nqy, nqz, qw, &lwx, &lwy, &lwz);
_mm256_store_ps(&bd->local_wind_x[base_c], lwx);
_mm256_store_ps(&bd->local_wind_y[base_c], lwy);
_mm256_store_ps(&bd->local_wind_z[base_c], lwz);
_mm256_store_ps(&bd->prev_mdl_pos_x[base_c], px);
_mm256_store_ps(&bd->prev_mdl_pos_y[base_c], py);
_mm256_store_ps(&bd->prev_mdl_pos_z[base_c], pz);
_mm256_store_ps(&bd->prev_mdl_rot_x[base_c], qx);
_mm256_store_ps(&bd->prev_mdl_rot_y[base_c], qy);
_mm256_store_ps(&bd->prev_mdl_rot_z[base_c], qz);
_mm256_store_ps(&bd->prev_mdl_rot_w[base_c], qw);
}
// -----------------------------------------------------------------------------
// Phase 2: Per-Cloth Solver Functions
// -----------------------------------------------------------------------------
static void
compute_wind_forces(struct cloth_sim* restrict sim, int c, float dt) {
int setup_idx = sim->fab_idx[c];
const struct cloth_fabric* restrict fab = &sim->fabs[setup_idx];
struct cloth_instance_state* restrict st = &sim->cloths[c];
struct cloth_batch_data* restrict bd = &sim->batch_data;
const __m256 inv_dt_v = _mm256_set1_ps((dt > EPSILON) ? 1.0f / dt : 0.0f);
const __m256 third = _mm256_set1_ps(1.0f / 3.0f);
const __m256 half = _mm256_set1_ps(0.5f);
const __m256 eps_sq = _mm256_set1_ps(EPSILON * EPSILON);
const __m256 drag_lift = _mm256_set1_ps(sim->cfgs[c].drag_coeff + sim->cfgs[c].lift_coeff);
const __m256 w_rho = _mm256_set1_ps(sim->cfgs[c].wind_density);
const __m256 wx = _mm256_set1_ps(bd->local_wind_x[c]);
const __m256 wy = _mm256_set1_ps(bd->local_wind_y[c]);
const __m256 wz = _mm256_set1_ps(bd->local_wind_z[c]);
// AVX2 Clear (Fixed Loop)
const __m256 zero_v = _mm256_setzero_ps();
for (int i = 0; i < MAX_PARTICLES; i += 8) {
_mm256_store_ps(&st->p_wnd_frc_x[i], zero_v);
_mm256_store_ps(&st->p_wnd_frc_y[i], zero_v);
_mm256_store_ps(&st->p_wnd_frc_z[i], zero_v);
}
for (int t_base = 0; t_base < MAX_TRIS; t_base += TRIANGLE_BATCH_SIZE) {
for (int t_offset = 0; t_offset < TRIANGLE_BATCH_SIZE; t_offset += 8) {
int t = t_base + t_offset;
__m256i i0 = _mm256_loadu_si256((__m256i*)&fab->tri_i0[t]);
__m256i i1 = _mm256_loadu_si256((__m256i*)&fab->tri_i1[t]);
__m256i i2 = _mm256_loadu_si256((__m256i*)&fab->tri_i2[t]);
__m256 p0x = _mm256_i32gather_ps(st->p_pos_x, i0, 4);
__m256 p0y = _mm256_i32gather_ps(st->p_pos_y, i0, 4);
__m256 p0z = _mm256_i32gather_ps(st->p_pos_z, i0, 4);
__m256 p1x = _mm256_i32gather_ps(st->p_pos_x, i1, 4);
__m256 p1y = _mm256_i32gather_ps(st->p_pos_y, i1, 4);
__m256 p1z = _mm256_i32gather_ps(st->p_pos_z, i1, 4);
__m256 p2x = _mm256_i32gather_ps(st->p_pos_x, i2, 4);
__m256 p2y = _mm256_i32gather_ps(st->p_pos_y, i2, 4);
__m256 p2z = _mm256_i32gather_ps(st->p_pos_z, i2, 4);
__m256 pr0x = _mm256_i32gather_ps(st->p_pos_x_prev, i0, 4);
__m256 pr0y = _mm256_i32gather_ps(st->p_pos_y_prev, i0, 4);
__m256 pr0z = _mm256_i32gather_ps(st->p_pos_z_prev, i0, 4);
__m256 pr1x = _mm256_i32gather_ps(st->p_pos_x_prev, i1, 4);
__m256 pr1y = _mm256_i32gather_ps(st->p_pos_y_prev, i1, 4);
__m256 pr1z = _mm256_i32gather_ps(st->p_pos_z_prev, i1, 4);
__m256 pr2x = _mm256_i32gather_ps(st->p_pos_x_prev, i2, 4);
__m256 pr2y = _mm256_i32gather_ps(st->p_pos_y_prev, i2, 4);
__m256 pr2z = _mm256_i32gather_ps(st->p_pos_z_prev, i2, 4);
__m256 v0x = _mm256_mul_ps(_mm256_sub_ps(p0x, pr0x), inv_dt_v);
__m256 v0y = _mm256_mul_ps(_mm256_sub_ps(p0y, pr0y), inv_dt_v);
__m256 v0z = _mm256_mul_ps(_mm256_sub_ps(p0z, pr0z), inv_dt_v);
__m256 v1x = _mm256_mul_ps(_mm256_sub_ps(p1x, pr1x), inv_dt_v);
__m256 v1y = _mm256_mul_ps(_mm256_sub_ps(p1y, pr1y), inv_dt_v);
__m256 v1z = _mm256_mul_ps(_mm256_sub_ps(p1z, pr1z), inv_dt_v);
__m256 v2x = _mm256_mul_ps(_mm256_sub_ps(p2x, pr2x), inv_dt_v);
__m256 v2y = _mm256_mul_ps(_mm256_sub_ps(p2y, pr2y), inv_dt_v);
__m256 v2z = _mm256_mul_ps(_mm256_sub_ps(p2z, pr2z), inv_dt_v);
__m256 tri_vx = _mm256_mul_ps(_mm256_add_ps(v0x, _mm256_add_ps(v1x, v2x)), third);
__m256 tri_vy = _mm256_mul_ps(_mm256_add_ps(v0y, _mm256_add_ps(v1y, v2y)), third);
__m256 tri_vz = _mm256_mul_ps(_mm256_add_ps(v0z, _mm256_add_ps(v1z, v2z)), third);
__m256 e1x = _mm256_sub_ps(p1x, p0x);
__m256 e1y = _mm256_sub_ps(p1y, p0y);
__m256 e1z = _mm256_sub_ps(p1z, p0z);
__m256 e2x = _mm256_sub_ps(p2x, p0x);
__m256 e2y = _mm256_sub_ps(p2y, p0y);
__m256 e2z = _mm256_sub_ps(p2z, p0z);
__m256 nx = _mm256_fmsub_ps(e1y, e2z, _mm256_mul_ps(e1z, e2y));
__m256 ny = _mm256_fmsub_ps(e1z, e2x, _mm256_mul_ps(e1x, e2z));
__m256 nz = _mm256_fmsub_ps(e1x, e2y, _mm256_mul_ps(e1y, e2x));
__m256 len_sq = _mm256_fmadd_ps(nz, nz, _mm256_fmadd_ps(ny, ny, _mm256_mul_ps(nx, nx)));
__m256 inv_nl = mm256_rsqrt_nr_ps(_mm256_max_ps(len_sq, eps_sq));
nx = _mm256_mul_ps(nx, inv_nl);
ny = _mm256_mul_ps(ny, inv_nl);
nz = _mm256_mul_ps(nz, inv_nl);
__m256 area = _mm256_mul_ps(_mm256_mul_ps(_mm256_max_ps(len_sq, eps_sq), inv_nl), half);
__m256 rvx = _mm256_sub_ps(wx, tri_vx);
__m256 rvy = _mm256_sub_ps(wy, tri_vy);
__m256 rvz = _mm256_sub_ps(wz, tri_vz);
__m256 dot = _mm256_fmadd_ps(nz, rvz, _mm256_fmadd_ps(ny, rvy, _mm256_mul_ps(nx, rvx)));
__m256 P = _mm256_mul_ps(w_rho, dot);
__m256 fx = _mm256_mul_ps(_mm256_mul_ps(P, area), nx);
__m256 fy = _mm256_mul_ps(_mm256_mul_ps(P, area), ny);
__m256 fz = _mm256_mul_ps(_mm256_mul_ps(P, area), nz);
__m256 v_sq = _mm256_fmadd_ps(rvz, rvz, _mm256_fmadd_ps(rvy, rvy, _mm256_mul_ps(rvx, rvx)));
__m256 DL = _mm256_mul_ps(drag_lift, _mm256_mul_ps(v_sq, area));
fx = _mm256_fmadd_ps(DL, nx, fx); fy = _mm256_fmadd_ps(DL, ny, fy); fz = _mm256_fmadd_ps(DL, nz, fz);
__m256 fx3 = _mm256_mul_ps(fx, third);
__m256 fy3 = _mm256_mul_ps(fy, third);
__m256 fz3 = _mm256_mul_ps(fz, third);
float bx[8], by[8], bz[8];
_mm256_store_ps(bx, fx3);
_mm256_store_ps(by, fy3);
_mm256_store_ps(bz, fz3);
int is0[8], is1[8], is2[8];
_mm256_store_si256((__m256i*)is0, i0);
_mm256_store_si256((__m256i*)is1, i1);
_mm256_store_si256((__m256i*)is2, i2);
for(int k=0; k<8; ++k) {
st->p_wnd_frc_x[is0[k]] += bx[k];
st->p_wnd_frc_y[is0[k]] += by[k];
st->p_wnd_frc_z[is0[k]] += bz[k];
st->p_wnd_frc_x[is1[k]] += bx[k];
st->p_wnd_frc_y[is1[k]] += by[k];
st->p_wnd_frc_z[is1[k]] += bz[k];
st->p_wnd_frc_x[is2[k]] += bx[k];
st->p_wnd_frc_y[is2[k]] += by[k];
st->p_wnd_frc_z[is2[k]] += bz[k];
}
}
}
}
static void
integrate_batch(struct cloth_sim* restrict sim, int c, int base_idx, float dt) {
struct cloth_instance_state* restrict st = &sim->cloths[c];
struct cloth_batch_data* restrict bd = &sim->batch_data;
const float damp = sim->cfgs[c].damping_coeff;
const float dt_sq = dt * dt;
const __m256 gx = _mm256_set1_ps(bd->local_gravity_x[c] * dt_sq);
const __m256 gy = _mm256_set1_ps(bd->local_gravity_y[c] * dt_sq);
const __m256 gz = _mm256_set1_ps(bd->local_gravity_z[c] * dt_sq);
const __m256 dmp_v = _mm256_set1_ps(1.0f - damp * dt);
const __m256 dt_sq_v = _mm256_set1_ps(dt_sq);
const __m256 dt_v = _mm256_set1_ps(dt);
const __m256 zero = _mm256_setzero_ps();
const __m256 lin_dx = _mm256_set1_ps(bd->lin_vel_x[c] * dt);
const __m256 lin_dy = _mm256_set1_ps(bd->lin_vel_y[c] * dt);
const __m256 lin_dz = _mm256_set1_ps(bd->lin_vel_z[c] * dt);
const __m256 avx = _mm256_set1_ps(bd->ang_vel_x[c]);
const __m256 avy = _mm256_set1_ps(bd->ang_vel_y[c]);
const __m256 avz = _mm256_set1_ps(bd->ang_vel_z[c]);
const __m256 csx = _mm256_set1_ps(sim->cfgs[c].centrifugal_inertia[0]);
const __m256 csy = _mm256_set1_ps(sim->cfgs[c].centrifugal_inertia[1]);
const __m256 csz = _mm256_set1_ps(sim->cfgs[c].centrifugal_inertia[2]);
for (int i = 0; i < PARTICLES_PER_BATCH; i += 8) {
int idx = base_idx + i;
__m256 px = _mm256_load_ps(&st->p_pos_x[idx]);
__m256 py = _mm256_load_ps(&st->p_pos_y[idx]);
__m256 pz = _mm256_load_ps(&st->p_pos_z[idx]);
__m256 im = _mm256_load_ps(&st->p_rmass[idx]);
__m256 fix = _mm256_cmp_ps(im, zero, _CMP_EQ_OQ);
__m256 prx = _mm256_load_ps(&st->p_pos_x_prev[idx]);
__m256 pry = _mm256_load_ps(&st->p_pos_y_prev[idx]);
__m256 prz = _mm256_load_ps(&st->p_pos_z_prev[idx]);
__m256 adx = _mm256_mul_ps(_mm256_fmsub_ps(avy, pz, _mm256_mul_ps(avz, py)), dt_v);
__m256 ady = _mm256_mul_ps(_mm256_fmsub_ps(avz, px, _mm256_mul_ps(avx, pz)), dt_v);
__m256 adz = _mm256_mul_ps(_mm256_fmsub_ps(avx, py, _mm256_mul_ps(avy, px)), dt_v);
__m256 cx = _mm256_fmsub_ps(avy, pz, _mm256_mul_ps(avz, py));
__m256 cy = _mm256_fmsub_ps(avz, px, _mm256_mul_ps(avx, pz));
__m256 cz = _mm256_fmsub_ps(avx, py, _mm256_mul_ps(avy, px));
__m256 cdx = _mm256_mul_ps(_mm256_mul_ps(_mm256_fmsub_ps(avy, cz, _mm256_mul_ps(avz, cy)), dt_v), csx);
__m256 cdy = _mm256_mul_ps(_mm256_mul_ps(_mm256_fmsub_ps(avz, cx, _mm256_mul_ps(avx, cz)), dt_v), csy);
__m256 cdz = _mm256_mul_ps(_mm256_mul_ps(_mm256_fmsub_ps(avx, cy, _mm256_mul_ps(avy, cx)), dt_v), csz);
prx = _mm256_sub_ps(prx, _mm256_add_ps(lin_dx, _mm256_add_ps(adx, cdx)));
pry = _mm256_sub_ps(pry, _mm256_add_ps(lin_dy, _mm256_add_ps(ady, cdy)));
prz = _mm256_sub_ps(prz, _mm256_add_ps(lin_dz, _mm256_add_ps(adz, cdz)));
__m256 wfx = _mm256_load_ps(&st->p_wnd_frc_x[idx]);
__m256 wfy = _mm256_load_ps(&st->p_wnd_frc_y[idx]);
__m256 wfz = _mm256_load_ps(&st->p_wnd_frc_z[idx]);
__m256 vx = _mm256_mul_ps(_mm256_sub_ps(px, prx), dmp_v);
__m256 vy = _mm256_mul_ps(_mm256_sub_ps(py, pry), dmp_v);
__m256 vz = _mm256_mul_ps(_mm256_sub_ps(pz, prz), dmp_v);
__m256 ax = _mm256_mul_ps(_mm256_mul_ps(wfx, im), dt_sq_v);
__m256 ay = _mm256_mul_ps(_mm256_mul_ps(wfy, im), dt_sq_v);
__m256 az = _mm256_mul_ps(_mm256_mul_ps(wfz, im), dt_sq_v);
__m256 npx = _mm256_add_ps(px, _mm256_add_ps(vx, _mm256_add_ps(gx, ax)));
__m256 npy = _mm256_add_ps(py, _mm256_add_ps(vy, _mm256_add_ps(gy, ay)));
__m256 npz = _mm256_add_ps(pz, _mm256_add_ps(vz, _mm256_add_ps(gz, az)));
_mm256_store_ps(&st->p_pos_x[idx], _mm256_blendv_ps(npx, px, fix));
_mm256_store_ps(&st->p_pos_y[idx], _mm256_blendv_ps(npy, py, fix));
_mm256_store_ps(&st->p_pos_z[idx], _mm256_blendv_ps(npz, pz, fix));
_mm256_store_ps(&st->p_pos_x_prev[idx], px);
_mm256_store_ps(&st->p_pos_y_prev[idx], py);
_mm256_store_ps(&st->p_pos_z_prev[idx], pz);
}
}
static void
compute_distance_constraints(struct cloth_sim* restrict sim, int c, float dt) {
int setup_idx = sim->fab_idx[c];
const struct cloth_fabric* restrict fab = &sim->fabs[setup_idx];
struct cloth_instance_state* restrict st = &sim->cloths[c];
const __m256 cmp_lim = _mm256_set1_ps(sim->cfgs[c].phases_compress_limit);
const __m256 str_lim = _mm256_set1_ps(sim->cfgs[c].phases_stretch_limit);
const __m256 stiff = _mm256_set1_ps(sim->cfgs[c].phases_stiffness);
const __m256 dt_sq = _mm256_set1_ps(dt*dt);
const __m256 zero = _mm256_setzero_ps();
const __m256 neg_one = _mm256_set1_ps(-1.0f);
const __m256 eps = _mm256_set1_ps(EPSILON * EPSILON);
for (int phase = 0; phase < MAX_PHASES - 1; phase++) {
int pid = fab->phase_idx[phase];
int start = fab->sets[pid];
int end = fab->sets[pid+1];
for (int i = start; i < end; i += CONSTRAINT_BATCH_SIZE) {
for (int j = 0; j < CONSTRAINT_BATCH_SIZE; j += 8) {
int idx = i + j;
__m256i i0 = _mm256_loadu_si256((__m256i*)&fab->idx0[idx]);
__m256i i1 = _mm256_loadu_si256((__m256i*)&fab->idx1[idx]);
__m256 px0 = _mm256_i32gather_ps(st->p_pos_x, i0, 4);
__m256 py0 = _mm256_i32gather_ps(st->p_pos_y, i0, 4);
__m256 pz0 = _mm256_i32gather_ps(st->p_pos_z, i0, 4);
__m256 px1 = _mm256_i32gather_ps(st->p_pos_x, i1, 4);
__m256 py1 = _mm256_i32gather_ps(st->p_pos_y, i1, 4);
__m256 pz1 = _mm256_i32gather_ps(st->p_pos_z, i1, 4);
__m256 w0 = _mm256_i32gather_ps(st->p_rmass, i0, 4);
__m256 w1 = _mm256_i32gather_ps(st->p_rmass, i1, 4);
__m256 rest = _mm256_loadu_ps(&fab->rest_vals[idx]);
__m256 k_val = _mm256_loadu_ps(&fab->stiff_vals[idx]);
k_val = _mm256_blendv_ps(stiff, k_val, _mm256_cmp_ps(k_val, zero, _CMP_NEQ_OQ));
__m256 dx = _mm256_sub_ps(px1, px0);
__m256 dy = _mm256_sub_ps(py1, py0);
__m256 dz = _mm256_sub_ps(pz1, pz0);
__m256 d2 = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 safe_d2 = _mm256_max_ps(d2, eps);
__m256 inv_l = mm256_rsqrt_nr_ps(safe_d2);
__m256 len = _mm256_mul_ps(safe_d2, inv_l);
__m256 delta = _mm256_sub_ps(len, rest);
__m256 d_clamped = _mm256_blendv_ps(_mm256_blendv_ps(delta, _mm256_mul_ps(delta, cmp_lim), _mm256_cmp_ps(delta, zero, _CMP_LT_OQ)), _mm256_mul_ps(delta, str_lim), _mm256_cmp_ps(delta, zero, _CMP_GT_OQ));
__m256 alpha = _mm256_div_ps(_mm256_rcp_ps(k_val), dt_sq);
__m256 factor = _mm256_mul_ps(neg_one, _mm256_div_ps(d_clamped, _mm256_add_ps(_mm256_add_ps(w0, w1), alpha)));
__m256 cx = _mm256_mul_ps(dx, _mm256_mul_ps(factor, inv_l));
__m256 cy = _mm256_mul_ps(dy, _mm256_mul_ps(factor, inv_l));
__m256 cz = _mm256_mul_ps(dz, _mm256_mul_ps(factor, inv_l));
float bx[8], by[8], bz[8], bw0[8], bw1[8];
_mm256_store_ps(bx, cx);
_mm256_store_ps(by, cy);
_mm256_store_ps(bz, cz);
_mm256_store_ps(bw0, w0);
_mm256_store_ps(bw1, w1);
int bi0[8], bi1[8];
_mm256_store_si256((__m256i*)bi0, i0);
_mm256_store_si256((__m256i*)bi1, i1);
for (int k=0; k<8; ++k) {
st->p_pos_x[bi0[k]] -= bw0[k]*bx[k];
st->p_pos_y[bi0[k]] -= bw0[k]*by[k];
st->p_pos_z[bi0[k]] -= bw0[k]*bz[k];
st->p_pos_x[bi1[k]] += bw1[k]*bx[k];
st->p_pos_y[bi1[k]] += bw1[k]*by[k];
st->p_pos_z[bi1[k]] += bw1[k]*bz[k];
}
}
}
}
}
static void
compute_tether_constraints(struct cloth_sim* restrict sim, int c, float dt) {
int setup_idx = sim->fab_idx[c];
const struct cloth_fabric* restrict fab = &sim->fabs[setup_idx];
struct cloth_instance_state* restrict st = &sim->cloths[c];
const __m256 dt_sq = _mm256_set1_ps(dt * dt);
const __m256 zero = _mm256_setzero_ps();
const __m256 eps = _mm256_set1_ps(EPSILON * EPSILON);
const __m256 stiffness = _mm256_set1_ps(sim->cfgs[c].tether_stiffness);
const __m256 len_scale = _mm256_set1_ps(sim->cfgs[c].tether_length_scale);
const __m256 alpha = _mm256_div_ps(_mm256_rcp_ps(stiffness), dt_sq);
for (int i = 0; i < MAX_TETHERS; i += 8) {
__m256i p_idx = _mm256_loadu_si256((__m256i*)&fab->particle_idx[i]);
__m256i a_idx = _mm256_loadu_si256((__m256i*)&fab->anchors[i]);
__m256 px = _mm256_i32gather_ps(st->p_pos_x, p_idx, 4);
__m256 py = _mm256_i32gather_ps(st->p_pos_y, p_idx, 4);
__m256 pz = _mm256_i32gather_ps(st->p_pos_z, p_idx, 4);
__m256 w0 = _mm256_i32gather_ps(st->p_rmass, p_idx, 4);
__m256 ax = _mm256_i32gather_ps(st->p_pos_x, a_idx, 4);
__m256 ay = _mm256_i32gather_ps(st->p_pos_y, a_idx, 4);
__m256 az = _mm256_i32gather_ps(st->p_pos_z, a_idx, 4);
__m256 w1 = _mm256_i32gather_ps(st->p_rmass, a_idx, 4);
__m256 rest = _mm256_mul_ps(_mm256_loadu_ps(&fab->tether_lens[i]), len_scale);
__m256 dx = _mm256_sub_ps(px, ax);
__m256 dy = _mm256_sub_ps(py, ay);
__m256 dz = _mm256_sub_ps(pz, az);
__m256 d2 = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 safe_d2 = _mm256_max_ps(d2, eps);
__m256 inv_l = mm256_rsqrt_nr_ps(safe_d2);
__m256 len = _mm256_mul_ps(safe_d2, inv_l);
__m256 delta = _mm256_sub_ps(len, rest);
// Unilateral: Only correct if stretched (len > rest)
__m256 mask = _mm256_cmp_ps(delta, zero, _CMP_GT_OQ);
__m256 w_sum = _mm256_add_ps(_mm256_add_ps(w0, w1), alpha);
__m256 factor = _mm256_div_ps(_mm256_mul_ps(delta, _mm256_set1_ps(-1.0f)), w_sum);
factor = _mm256_and_ps(factor, mask);
__m256 cx = _mm256_mul_ps(dx, _mm256_mul_ps(factor, inv_l));
__m256 cy = _mm256_mul_ps(dy, _mm256_mul_ps(factor, inv_l));
__m256 cz = _mm256_mul_ps(dz, _mm256_mul_ps(factor, inv_l));
float bx[8], by[8], bz[8], bw0[8], bw1[8];
_mm256_store_ps(bx, cx);
_mm256_store_ps(by, cy);
_mm256_store_ps(bz, cz);
_mm256_store_ps(bw0, w0);
_mm256_store_ps(bw1, w1);
int bi_p[8], bi_a[8];
_mm256_store_si256((__m256i*)bi_p, p_idx);
_mm256_store_si256((__m256i*)bi_a, a_idx);
for(int k=0; k<8; ++k) {
st->p_pos_x[bi_p[k]] -= bw0[k]*bx[k];
st->p_pos_y[bi_p[k]] -= bw0[k]*by[k];
st->p_pos_z[bi_p[k]] -= bw0[k]*bz[k];
st->p_pos_x[bi_a[k]] += bw1[k]*bx[k];
st->p_pos_y[bi_a[k]] += bw1[k]*by[k];
st->p_pos_z[bi_a[k]] += bw1[k]*bz[k];
}
}
}
static void
compute_collisions_and_finalize(struct cloth_sim* restrict sim, int c, int base_idx, float dt) {
struct cloth_instance_state* restrict st = &sim->cloths[c];
const __m256 inv_dt = _mm256_set1_ps((dt > EPSILON) ? 1.0f/dt : 0.0f);
const __m256 dt_v = _mm256_set1_ps(dt);
const __m256 zero = _mm256_setzero_ps();
const __m256 eps = _mm256_set1_ps(EPSILON*EPSILON);
const __m256 friction = _mm256_set1_ps(0.1f);
for (int i = 0; i < PARTICLES_PER_BATCH; i += 8) {
int idx = base_idx + i;
__m256 px = _mm256_load_ps(&st->p_pos_x[idx]);
__m256 py = _mm256_load_ps(&st->p_pos_y[idx]);
__m256 pz = _mm256_load_ps(&st->p_pos_z[idx]);
__m256 npx = px, npy = py, npz = pz;
__m256 contact = zero;
for (int s = 0; s < MAX_SPHERES; s++) {
int sidx = c * MAX_SPHERES + s;
__m256 sx = _mm256_set1_ps(sim->sphere_pos_x[sidx]);
__m256 sy = _mm256_set1_ps(sim->sphere_pos_y[sidx]);
__m256 sz = _mm256_set1_ps(sim->sphere_pos_z[sidx]);
__m256 r = _mm256_set1_ps(sim->sphere_radius[sidx]);
__m256 dx = _mm256_sub_ps(npx, sx);
__m256 dy = _mm256_sub_ps(npy, sy);
__m256 dz = _mm256_sub_ps(npz, sz);
__m256 d2 = _mm256_fmadd_ps(dz, dz,_mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
__m256 safe_d2 = _mm256_max_ps(d2, eps);
__m256 inv_d = mm256_rsqrt_nr_ps(safe_d2);
__m256 pen = _mm256_sub_ps(_mm256_mul_ps(safe_d2, inv_d), r);
__m256 mask = _mm256_cmp_ps(pen, zero, _CMP_LT_OQ);
contact = _mm256_or_ps(contact, mask);
__m256 push = _mm256_mul_ps(pen, inv_d);
npx = _mm256_sub_ps(npx, _mm256_and_ps(_mm256_mul_ps(dx, push), mask));
npy = _mm256_sub_ps(npy, _mm256_and_ps(_mm256_mul_ps(dy, push), mask));
npz = _mm256_sub_ps(npz, _mm256_and_ps(_mm256_mul_ps(dz, push), mask));
}
__m256 prx = _mm256_load_ps(&st->p_pos_x_prev[idx]);
__m256 pry = _mm256_load_ps(&st->p_pos_y_prev[idx]);
__m256 prz = _mm256_load_ps(&st->p_pos_z_prev[idx]);
__m256 vx = _mm256_mul_ps(_mm256_sub_ps(npx, prx), inv_dt);
__m256 vy = _mm256_mul_ps(_mm256_sub_ps(npy, pry), inv_dt);
__m256 vz = _mm256_mul_ps(_mm256_sub_ps(npz, prz), inv_dt);
__m256 scale = _mm256_blendv_ps(_mm256_set1_ps(1.0f), _mm256_sub_ps(_mm256_set1_ps(1.0f), friction), contact);
vx = _mm256_mul_ps(vx, scale);
vy = _mm256_mul_ps(vy, scale);
vz = _mm256_mul_ps(vz, scale);
prx = _mm256_sub_ps(npx, _mm256_mul_ps(vx, dt_v));
pry = _mm256_sub_ps(npy, _mm256_mul_ps(vy, dt_v));
prz = _mm256_sub_ps(npz, _mm256_mul_ps(vz, dt_v));
__m256 fix = _mm256_cmp_ps(_mm256_load_ps(&st->p_rmass[idx]), zero, _CMP_EQ_OQ);
npx = _mm256_blendv_ps(npx, px, fix);
npy = _mm256_blendv_ps(npy, py, fix);
npz = _mm256_blendv_ps(npz, pz, fix);
_mm256_store_ps(&st->p_pos_x[idx], npx)
_mm256_store_ps(&st->p_pos_y[idx], npy);
_mm256_store_ps(&st->p_pos_z[idx], npz);
_mm256_store_ps(&st->p_pos_x_prev[idx], prx);
_mm256_store_ps(&st->p_pos_y_prev[idx], pry);
_mm256_store_ps(&st->p_pos_z_prev[idx], prz);
}
}
// -----------------------------------------------------------------------------
// Public API
// -----------------------------------------------------------------------------
extern void
cloth_pre_batch_update(struct cloth_sim* restrict sim, float dt) {
SIM_ASSERT(sim != NULL);
SIM_ASSERT(dt > EPSILON);
for (int c = 0; c < MAX_CLOTHS; c += 8) {
compute_batch_setup(sim, c, dt);
}
}
extern void
cloth_solve_range(struct cloth_sim* restrict sim, float dt, int start_cloth, int end_cloth) {
SIM_ASSERT(sim != NULL);
SIM_ASSERT(dt > EPSILON);
SIM_ASSERT(start_cloth >= 0 && end_cloth <= MAX_CLOTHS);
for (int c = start_cloth; c < end_cloth; c++) {
SIM_VALIDATE(sim, c);
compute_wind_forces(sim, c, dt);
for (int i = 0; i < MAX_PARTICLES; i += PARTICLES_PER_BATCH) {
integrate_batch(sim, c, i, dt);
}
const float sub_dt = dt / SUBSTEPS;
for (int s = 0; s < SUBSTEPS; s++) {
compute_distance_constraints(sim, c, sub_dt);
compute_tether_constraints(sim, c, sub_dt);
for (int i = 0; i < MAX_PARTICLES; i += PARTICLES_PER_BATCH) {
compute_collisions_and_finalize(sim, c, i, sub_dt);
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment