vurtun · December 31, 2025 15:36
diff --git a/cloth.c b/cloth.c
 /*
 * -----------------------------------------------------------------------------
 * High-Performance Hard-Realtime Cloth Simulation
 * -----------------------------------------------------------------------------
 * DESCRIPTION:
 *   This module implements a production-hardened CPU cloth 
 *   solver designed for AA/AAA open-world game. It utilizes 
 *   NvCloth cooking data (topology) but replaces the runtime with a highly 
 *   optimized, cache-aware, and predictable AVX2 implementation.
 *
 *   Target Performance: < 3.0ms total frame time for 128 active cloths on 
 *   mainstream hardware (Ryzen 3000 / Intel 8th Gen), scaling linearly with 
 *   core count.
 *
 * PHYSICS MODEL:
 *   - Framework: XPBD (Extended Position Based Dynamics) with Graph-Colored 
 *     Gauss-Seidel solving.
 *   - Compliance: Stiffness is timestep-invariant (alpha = 1 / (k * dt^2)).
 *   - Aerodynamics: Triangle-based Drag & Lift model with occlusion approximation.
 *   - Coupling: Full Rigid Body coupling including Centrifugal, Euler, and 
 *     Coriolis fictitious forces.
 *   - Constraints: Distance (with compression/stretch limits), Unilateral Tethers 
 *     (long-range attachment safety), and Sphere Collisions.
 *
 * ARCHITECTURAL DESIGN DECISIONS:
 *   1. HYBRID MEMORY LAYOUT (SoA + AoS):
 *      - Batch Data (SoA): Rigid body transforms and global forces are stored in 
 *        Structure-of-Arrays to allow processing 8 cloths simultaneously using 
 *        pure AVX2 registers.
 *      - Solver Data (AoS): Particle state is stored in Per-Cloth Structs 
 *        (~15KB). This fits entirely in L1 Cache, making random access (Gather) 
 *        extremely fast (~4 cycles) effectively neutralizing scatter/gather cost.
 *
 *   2. HARD REAL-TIME GUARANTEES:
 *      - Zero Memory Allocation: All memory is statically defined. No malloc/free.
 *      - Zero Variance: All loops (particles, tethers, colliders) have fixed 
 *        bounds. There are no dynamic branches based on active constraints. 
 *        This ensures a "flat line" in profiling, critical for consistent frame pacing.
 *
 *   3. SOLVER STRATEGY (Gauss-Seidel):
 *      - Unlike Jacobi solvers, this implementation applies position corrections 
 *        immediately within the solver loop. This requires Graph Coloring (Phases) 
 *        to be thread-safe, but results in significantly faster convergence 
 *        (tighter cloth) than Jacobi.
 *
 * USAGE:
 *   1. Call `cloth_pre_batch_update(dt)` on the main thread (Very fast).
 *   2. Dispatch `cloth_solve_range(..., start, end)` across worker threads 
 *      via the Job System.
 *
 * CONSTRAINTS:
 *   - Max Cloths: 128
 *   - Max Particles per Cloth: 256 (Simulation mesh, skin visual mesh to this)
 *   - SIMD: Requires AVX2 (Haswell+).
 *   - Code Style: MISRA-C compliant (restrict, const correctness, no hidden loops).
 * -----------------------------------------------------------------------------
 */
 #include <immintrin.h>
 #include <string.h>
 #include <assert.h>
 #include <math.h>
 #include <stdalign.h>
 #include <stdint.h>

 // -----------------------------------------------------------------------------
 // Configuration & Constants (Target: < 3ms for 128 Cloths on 8th Gen HW)
 // -----------------------------------------------------------------------------
 // 128 active cloths. 128 / 8 = 16 AVX2 batches.
 #define MAX_CLOTHS        128
 // 256 vertices (16x16 grid).
 // Struct size: ~15KB (Fits entirely in L1 Cache per cloth).
 #define MAX_PARTICLES     256
 #define MAX_TRIS          768
 #define MAX_CONSTRAINTS   2048
 #define MAX_PHASES        16
 #define MAX_FABRICS       128
 #define MAX_SPHERES       16
 #define MAX_ANCHORS       256
 #define MAX_TETHERS       256

 // -----------------------------------------------------------------------------
 // Alignment & Tuning
 // -----------------------------------------------------------------------------
 #define ALIGNMENT 32
 #define EPSILON 0.0001f

 #define PARTICLES_PER_BATCH 64
 #define CONSTRAINT_BATCH_SIZE 256
 #define TRIANGLE_BATCH_SIZE 256
 #define SUBSTEPS 4

 // Runtime validation macros (Zero overhead in Release)
 #ifndef NDEBUG
 #define SIM_ASSERT(x) assert(x)
 #define SIM_VALIDATE(s, c) validate_state(s, c)
 // Forward declaration
 struct cloth_sim;
 static void validate_state(const struct cloth_sim* sim, int c);
 #else
 #define SIM_ASSERT(x) ((void)0)
 #define SIM_VALIDATE(s, c) ((void)0)
 #endif

 // -----------------------------------------------------------------------------
 // Data Structures
 // -----------------------------------------------------------------------------
 struct cloth_cfg {
  float motion_stiffness;
  float phases_stiffness;
  float phases_compress_limit;
  float phases_stretch_limit;
  float drag_coeff;
  float lift_coeff;
  float wind_density;
  float wind_friction;
  float tether_stiffness;
  float tether_length_scale;
  float damping_coeff;
  float linear_inertia[3];
  float angular_inertia[3];
  float centrifugal_inertia[3];
 } __attribute__((aligned(32)));

 struct cloth_fabric {
  alignas(32) int tri_i0[MAX_TRIS];
  alignas(32) int tri_i1[MAX_TRIS];
  alignas(32) int tri_i2[MAX_TRIS];
  alignas(32) int idx0[MAX_CONSTRAINTS];
  alignas(32) int idx1[MAX_CONSTRAINTS];
  alignas(32) float rest_vals[MAX_CONSTRAINTS];
  alignas(32) float stiff_vals[MAX_CONSTRAINTS];
  alignas(32) int anchors[MAX_ANCHORS];
  alignas(32) float tether_lens[MAX_TETHERS];
  alignas(32) int particle_idx[MAX_TETHERS];
  alignas(32) int phase_idx[MAX_PHASES];
  alignas(32) int sets[MAX_PHASES];
 };
 struct cloth_batch_data {
  alignas(32) float mdl_pos_x[MAX_CLOTHS];
  alignas(32) float mdl_pos_y[MAX_CLOTHS];
  alignas(32) float mdl_pos_z[MAX_CLOTHS];
  
  alignas(32) float mdl_rot_x[MAX_CLOTHS];
  alignas(32) float mdl_rot_y[MAX_CLOTHS];
  alignas(32) float mdl_rot_z[MAX_CLOTHS];
  alignas(32) float mdl_rot_w[MAX_CLOTHS];

  alignas(32) float prev_mdl_pos_x[MAX_CLOTHS];
  alignas(32) float prev_mdl_pos_y[MAX_CLOTHS];
  alignas(32) float prev_mdl_pos_z[MAX_CLOTHS];

  alignas(32) float prev_mdl_rot_x[MAX_CLOTHS];
  alignas(32) float prev_mdl_rot_y[MAX_CLOTHS];
  alignas(32) float prev_mdl_rot_z[MAX_CLOTHS];
  alignas(32) float prev_mdl_rot_w[MAX_CLOTHS];

  alignas(32) float local_gravity_x[MAX_CLOTHS];
  alignas(32) float local_gravity_y[MAX_CLOTHS];
  alignas(32) float local_gravity_z[MAX_CLOTHS];

  alignas(32) float local_wind_x[MAX_CLOTHS];
  alignas(32) float local_wind_y[MAX_CLOTHS];
  alignas(32) float local_wind_z[MAX_CLOTHS];

  alignas(32) float lin_vel_x[MAX_CLOTHS];
  alignas(32) float lin_vel_y[MAX_CLOTHS];
  alignas(32) float lin_vel_z[MAX_CLOTHS];

  alignas(32) float ang_vel_x[MAX_CLOTHS];
  alignas(32) float ang_vel_y[MAX_CLOTHS];
  alignas(32) float ang_vel_z[MAX_CLOTHS];
 };
 struct cloth_instance_state {
  alignas(32) float p_pos_x[MAX_PARTICLES];
  alignas(32) float p_pos_y[MAX_PARTICLES];
  alignas(32) float p_pos_z[MAX_PARTICLES];
  alignas(32) float p_rmass[MAX_PARTICLES];

  alignas(32) float p_pos_x_prev[MAX_PARTICLES];
  alignas(32) float p_pos_y_prev[MAX_PARTICLES];
  alignas(32) float p_pos_z_prev[MAX_PARTICLES];

  alignas(32) float p_rest_x[MAX_PARTICLES];
  alignas(32) float p_rest_y[MAX_PARTICLES];
  alignas(32) float p_rest_z[MAX_PARTICLES];
  alignas(32) float p_rest_w[MAX_PARTICLES]; 

  alignas(32) float p_wnd_frc_x[MAX_PARTICLES];
  alignas(32) float p_wnd_frc_y[MAX_PARTICLES];
  alignas(32) float p_wnd_frc_z[MAX_PARTICLES];
 };
 struct cloth_sim {
  int fab_idx[MAX_CLOTHS];
  struct cloth_cfg cfgs[MAX_CLOTHS];
  struct cloth_fabric fabs[MAX_FABRICS];
  
  // data layout split for optimal access patterns
  struct cloth_batch_data batch_data;
  struct cloth_instance_state cloths[MAX_CLOTHS];

  // colliders
  alignas(32) float sphere_pos_x[MAX_CLOTHS * MAX_SPHERES];
  alignas(32) float sphere_pos_y[MAX_CLOTHS * MAX_SPHERES];
  alignas(32) float sphere_pos_z[MAX_CLOTHS * MAX_SPHERES];
  alignas(32) float sphere_radius[MAX_CLOTHS * MAX_SPHERES];

  alignas(32) float global_gravity[4];
  alignas(32) float global_wind[4];
 } __attribute__((aligned(64)));
 _Static_assert(sizeof(struct cloth_instance_state) % 32 == 0, "Instance state misalignment");

 // -----------------------------------------------------------------------------
 // Helper Intrinsics
 // -----------------------------------------------------------------------------
 static inline __attribute__((always_inline)) __m256
 mm256_rsqrt_nr_ps(__m256 x) {
  // Newton-Raphson refinement for rsqrt (Increases precision 10^-4 -> 10^-7)
  __m256 nr = _mm256_rsqrt_ps(x);
  __m256 muls = _mm256_mul_ps(_mm256_mul_ps(nr, nr), x);
  __m256 beta = _mm256_mul_ps(_mm256_sub_ps(_mm256_set1_ps(3.0f), muls), _mm256_set1_ps(0.5f));
  return _mm256_mul_ps(nr, beta);
 }
 // Rotates vector by inverse quaternion (Requires negated q.xyz inputs)
 static inline __attribute__((always_inline)) void
 avx_rotate_vector_inverse(float vx, float vy, float vz,
    __m256 nqx, __m256 nqy, __m256 nqz, __m256 qw,
    __m256* out_x, __m256* out_y, __m256* out_z) {
  const __m256 g_vx = _mm256_set1_ps(vx);
  const __m256 g_vy = _mm256_set1_ps(vy);
  const __m256 g_vz = _mm256_set1_ps(vz);
  const __m256 two  = _mm256_set1_ps(2.0f);

  __m256 tx = _mm256_mul_ps(two, _mm256_sub_ps(_mm256_mul_ps(nqy, g_vz), _mm256_mul_ps(nqz, g_vy)));
  __m256 ty = _mm256_mul_ps(two, _mm256_sub_ps(_mm256_mul_ps(nqz, g_vx), _mm256_mul_ps(nqx, g_vz)));
  __m256 tz = _mm256_mul_ps(two, _mm256_sub_ps(_mm256_mul_ps(nqx, g_vy), _mm256_mul_ps(nqy, g_vx)));

  *out_x = _mm256_add_ps(g_vx, _mm256_add_ps(_mm256_mul_ps(qw, tx), 
           _mm256_sub_ps(_mm256_mul_ps(nqy, tz), _mm256_mul_ps(nqz, ty))));
  *out_y = _mm256_add_ps(g_vy, _mm256_add_ps(_mm256_mul_ps(qw, ty), 
           _mm256_sub_ps(_mm256_mul_ps(nqz, tx), _mm256_mul_ps(nqx, tz))));
  *out_z = _mm256_add_ps(g_vz, _mm256_add_ps(_mm256_mul_ps(qw, tz), 
           _mm256_sub_ps(_mm256_mul_ps(nqx, ty), _mm256_mul_ps(nqy, tx))));
 }

 // -----------------------------------------------------------------------------
 // Validation (Debug)
 // -----------------------------------------------------------------------------
 #ifndef NDEBUG
 static void
 validate_state(const struct cloth_sim* sim, int c) {
  SIM_ASSERT(sim != NULL);
  SIM_ASSERT(0 <= c && c < MAX_CLOTHS);
  const struct cloth_instance_state* st = &sim->cloths[c];
  for (int i = 0; i < MAX_PARTICLES; ++i) {
    SIM_ASSERT(isfinite(st->p_pos_x[i]));
    SIM_ASSERT(isfinite(st->p_pos_y[i]));
    SIM_ASSERT(isfinite(st->p_pos_z[i]));
  }
 }
 #endif

 // -----------------------------------------------------------------------------
 // Phase 1: Batch Setup (Rigid Body Math)
 // -----------------------------------------------------------------------------
 static void
 compute_batch_setup(struct cloth_sim* restrict sim, int base_c, float dt) {
  struct cloth_batch_data* restrict bd = &sim->batch_data;
  const __m256 inv_dt = _mm256_rcp_ps(_mm256_max_ps(_mm256_set1_ps(dt), _mm256_set1_ps(EPSILON)));
  const __m256 two = _mm256_set1_ps(2.0f);
  const __m256 zero = _mm256_setzero_ps();

  __m256 px = _mm256_load_ps(&bd->mdl_pos_x[base_c]);
  __m256 py = _mm256_load_ps(&bd->mdl_pos_y[base_c]);
  __m256 pz = _mm256_load_ps(&bd->mdl_pos_z[base_c]);
  
  __m256 qx = _mm256_load_ps(&bd->mdl_rot_x[base_c]);
  __m256 qy = _mm256_load_ps(&bd->mdl_rot_y[base_c]);
  __m256 qz = _mm256_load_ps(&bd->mdl_rot_z[base_c]);
  __m256 qw = _mm256_load_ps(&bd->mdl_rot_w[base_c]);

  __m256 prev_px = _mm256_load_ps(&bd->prev_mdl_pos_x[base_c]);
  __m256 prev_py = _mm256_load_ps(&bd->prev_mdl_pos_y[base_c]);
  __m256 prev_pz = _mm256_load_ps(&bd->prev_mdl_pos_z[base_c]);

  __m256 prev_qx = _mm256_load_ps(&bd->prev_mdl_rot_x[base_c]);
  __m256 prev_qy = _mm256_load_ps(&bd->prev_mdl_rot_y[base_c]);
  __m256 prev_qz = _mm256_load_ps(&bd->prev_mdl_rot_z[base_c]);
  __m256 prev_qw = _mm256_load_ps(&bd->prev_mdl_rot_w[base_c]);

  float lin_in_x[8], lin_in_y[8], lin_in_z[8];
  float ang_in_x[8], ang_in_y[8], ang_in_z[8];
  for (int k=0; k<8; ++k) {
    const struct cloth_cfg* cfg = &sim->cfgs[base_c+k];
    lin_in_x[k] = cfg->linear_inertia[0];
    lin_in_y[k] = cfg->linear_inertia[1];
    lin_in_z[k] = cfg->linear_inertia[2];

    ang_in_x[k] = cfg->angular_inertia[0];
    ang_in_y[k] = cfg->angular_inertia[1];
    ang_in_z[k] = cfg->angular_inertia[2];
  }
  __m256 lv_x = _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(px, prev_px), inv_dt), _mm256_loadu_ps(lin_in_x));
  __m256 lv_y = _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(py, prev_py), inv_dt), _mm256_loadu_ps(lin_in_y));
  __m256 lv_z = _mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(pz, prev_pz), inv_dt), _mm256_loadu_ps(lin_in_z));

  _mm256_store_ps(&bd->lin_vel_x[base_c], lv_x);
  _mm256_store_ps(&bd->lin_vel_y[base_c], lv_y);
  _mm256_store_ps(&bd->lin_vel_z[base_c], lv_z);

  __m256 n_pqx = _mm256_sub_ps(zero, prev_qx);
  __m256 n_pqy = _mm256_sub_ps(zero, prev_qy);
  __m256 n_pqz = _mm256_sub_ps(zero, prev_qz);

  __m256 rx = _mm256_add_ps(_mm256_mul_ps(qw, n_pqx), _mm256_add_ps(_mm256_mul_ps(qx, prev_qw), _mm256_sub_ps(_mm256_mul_ps(qy, n_pqz), _mm256_mul_ps(qz, n_pqy))));
  __m256 ry = _mm256_add_ps(_mm256_mul_ps(qw, n_pqy), _mm256_add_ps(_mm256_mul_ps(qy, prev_qw), _mm256_sub_ps(_mm256_mul_ps(qz, n_pqx), _mm256_mul_ps(qx, n_pqz))));
  __m256 rz = _mm256_add_ps(_mm256_mul_ps(qw, n_pqz), _mm256_add_ps(_mm256_mul_ps(qz, prev_qw), _mm256_sub_ps(_mm256_mul_ps(qx, n_pqy), _mm256_mul_ps(qy, n_pqx))));

  __m256 scale = _mm256_mul_ps(two, inv_dt);
  _mm256_store_ps(&bd->ang_vel_x[base_c], _mm256_mul_ps(_mm256_mul_ps(rx, scale), _mm256_loadu_ps(ang_in_x)));
  _mm256_store_ps(&bd->ang_vel_y[base_c], _mm256_mul_ps(_mm256_mul_ps(ry, scale), _mm256_loadu_ps(ang_in_y)));
  _mm256_store_ps(&bd->ang_vel_z[base_c], _mm256_mul_ps(_mm256_mul_ps(rz, scale), _mm256_loadu_ps(ang_in_z)));

  __m256 nqx = _mm256_sub_ps(zero, qx);
  __m256 nqy = _mm256_sub_ps(zero, qy);
  __m256 nqz = _mm256_sub_ps(zero, qz);

  __m256 lgx, lgy, lgz;
  avx_rotate_vector_inverse(sim->global_gravity[0], sim->global_gravity[1], sim->global_gravity[2], nqx, nqy, nqz, qw, &lgx, &lgy, &lgz);
  _mm256_store_ps(&bd->local_gravity_x[base_c], lgx);
  _mm256_store_ps(&bd->local_gravity_y[base_c], lgy);
  _mm256_store_ps(&bd->local_gravity_z[base_c], lgz);

  __m256 lwx, lwy, lwz;
  avx_rotate_vector_inverse(sim->global_wind[0], sim->global_wind[1], sim->global_wind[2], nqx, nqy, nqz, qw, &lwx, &lwy, &lwz);
  _mm256_store_ps(&bd->local_wind_x[base_c], lwx);
  _mm256_store_ps(&bd->local_wind_y[base_c], lwy);
  _mm256_store_ps(&bd->local_wind_z[base_c], lwz);

  _mm256_store_ps(&bd->prev_mdl_pos_x[base_c], px);
  _mm256_store_ps(&bd->prev_mdl_pos_y[base_c], py);
  _mm256_store_ps(&bd->prev_mdl_pos_z[base_c], pz);
  _mm256_store_ps(&bd->prev_mdl_rot_x[base_c], qx);
  _mm256_store_ps(&bd->prev_mdl_rot_y[base_c], qy);
  _mm256_store_ps(&bd->prev_mdl_rot_z[base_c], qz);
  _mm256_store_ps(&bd->prev_mdl_rot_w[base_c], qw);
 }
 // -----------------------------------------------------------------------------
 // Phase 2: Per-Cloth Solver Functions
 // -----------------------------------------------------------------------------
 static void
 compute_wind_forces(struct cloth_sim* restrict sim, int c, float dt) {
  int setup_idx = sim->fab_idx[c];
  const struct cloth_fabric* restrict fab = &sim->fabs[setup_idx];
  struct cloth_instance_state* restrict st = &sim->cloths[c];
  struct cloth_batch_data* restrict bd = &sim->batch_data;
  
  const __m256 inv_dt_v = _mm256_set1_ps((dt > EPSILON) ? 1.0f / dt : 0.0f);
  const __m256 third = _mm256_set1_ps(1.0f / 3.0f);
  const __m256 half = _mm256_set1_ps(0.5f);
  const __m256 eps_sq = _mm256_set1_ps(EPSILON * EPSILON);
  const __m256 drag_lift = _mm256_set1_ps(sim->cfgs[c].drag_coeff + sim->cfgs[c].lift_coeff);
  const __m256 w_rho = _mm256_set1_ps(sim->cfgs[c].wind_density);

  const __m256 wx = _mm256_set1_ps(bd->local_wind_x[c]);
  const __m256 wy = _mm256_set1_ps(bd->local_wind_y[c]);
  const __m256 wz = _mm256_set1_ps(bd->local_wind_z[c]);

  // AVX2 Clear (Fixed Loop)
  const __m256 zero_v = _mm256_setzero_ps();
  for (int i = 0; i < MAX_PARTICLES; i += 8) {
    _mm256_store_ps(&st->p_wnd_frc_x[i], zero_v);
    _mm256_store_ps(&st->p_wnd_frc_y[i], zero_v);
    _mm256_store_ps(&st->p_wnd_frc_z[i], zero_v);
  }
  for (int t_base = 0; t_base < MAX_TRIS; t_base += TRIANGLE_BATCH_SIZE) {
    for (int t_offset = 0; t_offset < TRIANGLE_BATCH_SIZE; t_offset += 8) {
      int t = t_base + t_offset;
      __m256i i0 = _mm256_loadu_si256((__m256i*)&fab->tri_i0[t]);
      __m256i i1 = _mm256_loadu_si256((__m256i*)&fab->tri_i1[t]);
      __m256i i2 = _mm256_loadu_si256((__m256i*)&fab->tri_i2[t]);

      __m256 p0x = _mm256_i32gather_ps(st->p_pos_x, i0, 4);
      __m256 p0y = _mm256_i32gather_ps(st->p_pos_y, i0, 4);
      __m256 p0z = _mm256_i32gather_ps(st->p_pos_z, i0, 4);

      __m256 p1x = _mm256_i32gather_ps(st->p_pos_x, i1, 4);
      __m256 p1y = _mm256_i32gather_ps(st->p_pos_y, i1, 4);
      __m256 p1z = _mm256_i32gather_ps(st->p_pos_z, i1, 4);

      __m256 p2x = _mm256_i32gather_ps(st->p_pos_x, i2, 4);
      __m256 p2y = _mm256_i32gather_ps(st->p_pos_y, i2, 4);
      __m256 p2z = _mm256_i32gather_ps(st->p_pos_z, i2, 4);

      __m256 pr0x = _mm256_i32gather_ps(st->p_pos_x_prev, i0, 4);
      __m256 pr0y = _mm256_i32gather_ps(st->p_pos_y_prev, i0, 4);
      __m256 pr0z = _mm256_i32gather_ps(st->p_pos_z_prev, i0, 4);

      __m256 pr1x = _mm256_i32gather_ps(st->p_pos_x_prev, i1, 4);
      __m256 pr1y = _mm256_i32gather_ps(st->p_pos_y_prev, i1, 4);
      __m256 pr1z = _mm256_i32gather_ps(st->p_pos_z_prev, i1, 4);

      __m256 pr2x = _mm256_i32gather_ps(st->p_pos_x_prev, i2, 4);
      __m256 pr2y = _mm256_i32gather_ps(st->p_pos_y_prev, i2, 4); 
      __m256 pr2z = _mm256_i32gather_ps(st->p_pos_z_prev, i2, 4);

      __m256 v0x = _mm256_mul_ps(_mm256_sub_ps(p0x, pr0x), inv_dt_v);
      __m256 v0y = _mm256_mul_ps(_mm256_sub_ps(p0y, pr0y), inv_dt_v);
      __m256 v0z = _mm256_mul_ps(_mm256_sub_ps(p0z, pr0z), inv_dt_v);

      __m256 v1x = _mm256_mul_ps(_mm256_sub_ps(p1x, pr1x), inv_dt_v);
      __m256 v1y = _mm256_mul_ps(_mm256_sub_ps(p1y, pr1y), inv_dt_v);
      __m256 v1z = _mm256_mul_ps(_mm256_sub_ps(p1z, pr1z), inv_dt_v);

      __m256 v2x = _mm256_mul_ps(_mm256_sub_ps(p2x, pr2x), inv_dt_v);
      __m256 v2y = _mm256_mul_ps(_mm256_sub_ps(p2y, pr2y), inv_dt_v);
      __m256 v2z = _mm256_mul_ps(_mm256_sub_ps(p2z, pr2z), inv_dt_v);

      __m256 tri_vx = _mm256_mul_ps(_mm256_add_ps(v0x, _mm256_add_ps(v1x, v2x)), third);
      __m256 tri_vy = _mm256_mul_ps(_mm256_add_ps(v0y, _mm256_add_ps(v1y, v2y)), third);
      __m256 tri_vz = _mm256_mul_ps(_mm256_add_ps(v0z, _mm256_add_ps(v1z, v2z)), third);

      __m256 e1x = _mm256_sub_ps(p1x, p0x);
      __m256 e1y = _mm256_sub_ps(p1y, p0y);
      __m256 e1z = _mm256_sub_ps(p1z, p0z);
      __m256 e2x = _mm256_sub_ps(p2x, p0x);
      __m256 e2y = _mm256_sub_ps(p2y, p0y);
      __m256 e2z = _mm256_sub_ps(p2z, p0z);

      __m256 nx = _mm256_fmsub_ps(e1y, e2z, _mm256_mul_ps(e1z, e2y));
      __m256 ny = _mm256_fmsub_ps(e1z, e2x, _mm256_mul_ps(e1x, e2z));
      __m256 nz = _mm256_fmsub_ps(e1x, e2y, _mm256_mul_ps(e1y, e2x));

      __m256 len_sq = _mm256_fmadd_ps(nz, nz, _mm256_fmadd_ps(ny, ny, _mm256_mul_ps(nx, nx)));
      __m256 inv_nl = mm256_rsqrt_nr_ps(_mm256_max_ps(len_sq, eps_sq));
      
      nx = _mm256_mul_ps(nx, inv_nl);
      ny = _mm256_mul_ps(ny, inv_nl);
      nz = _mm256_mul_ps(nz, inv_nl);
      __m256 area = _mm256_mul_ps(_mm256_mul_ps(_mm256_max_ps(len_sq, eps_sq), inv_nl), half);

      __m256 rvx = _mm256_sub_ps(wx, tri_vx);
      __m256 rvy = _mm256_sub_ps(wy, tri_vy);
      __m256 rvz = _mm256_sub_ps(wz, tri_vz);
      __m256 dot = _mm256_fmadd_ps(nz, rvz, _mm256_fmadd_ps(ny, rvy, _mm256_mul_ps(nx, rvx)));
      __m256 P = _mm256_mul_ps(w_rho, dot);

      __m256 fx = _mm256_mul_ps(_mm256_mul_ps(P, area), nx);
      __m256 fy = _mm256_mul_ps(_mm256_mul_ps(P, area), ny);
      __m256 fz = _mm256_mul_ps(_mm256_mul_ps(P, area), nz);

      __m256 v_sq = _mm256_fmadd_ps(rvz, rvz, _mm256_fmadd_ps(rvy, rvy, _mm256_mul_ps(rvx, rvx)));
      __m256 DL = _mm256_mul_ps(drag_lift, _mm256_mul_ps(v_sq, area));
      fx = _mm256_fmadd_ps(DL, nx, fx); fy = _mm256_fmadd_ps(DL, ny, fy); fz = _mm256_fmadd_ps(DL, nz, fz);

      __m256 fx3 = _mm256_mul_ps(fx, third);
      __m256 fy3 = _mm256_mul_ps(fy, third);
      __m256 fz3 = _mm256_mul_ps(fz, third);

      float bx[8], by[8], bz[8];
      _mm256_store_ps(bx, fx3);
      _mm256_store_ps(by, fy3);
      _mm256_store_ps(bz, fz3);

      int is0[8], is1[8], is2[8];
      _mm256_store_si256((__m256i*)is0, i0);
      _mm256_store_si256((__m256i*)is1, i1);
      _mm256_store_si256((__m256i*)is2, i2);

      for(int k=0; k<8; ++k) {
        st->p_wnd_frc_x[is0[k]] += bx[k];
        st->p_wnd_frc_y[is0[k]] += by[k];
        st->p_wnd_frc_z[is0[k]] += bz[k];

        st->p_wnd_frc_x[is1[k]] += bx[k];
        st->p_wnd_frc_y[is1[k]] += by[k];
        st->p_wnd_frc_z[is1[k]] += bz[k];

        st->p_wnd_frc_x[is2[k]] += bx[k];
        st->p_wnd_frc_y[is2[k]] += by[k];
        st->p_wnd_frc_z[is2[k]] += bz[k];
      }
    }
  }
 }
 static void
 integrate_batch(struct cloth_sim* restrict sim, int c, int base_idx, float dt) {
  struct cloth_instance_state* restrict st = &sim->cloths[c];
  struct cloth_batch_data* restrict bd = &sim->batch_data;

  const float damp = sim->cfgs[c].damping_coeff;
  const float dt_sq = dt * dt;
  
  const __m256 gx = _mm256_set1_ps(bd->local_gravity_x[c] * dt_sq);
  const __m256 gy = _mm256_set1_ps(bd->local_gravity_y[c] * dt_sq);
  const __m256 gz = _mm256_set1_ps(bd->local_gravity_z[c] * dt_sq);
  const __m256 dmp_v = _mm256_set1_ps(1.0f - damp * dt);
  const __m256 dt_sq_v = _mm256_set1_ps(dt_sq);
  const __m256 dt_v = _mm256_set1_ps(dt);
  const __m256 zero = _mm256_setzero_ps();

  const __m256 lin_dx = _mm256_set1_ps(bd->lin_vel_x[c] * dt);
  const __m256 lin_dy = _mm256_set1_ps(bd->lin_vel_y[c] * dt);
  const __m256 lin_dz = _mm256_set1_ps(bd->lin_vel_z[c] * dt);

  const __m256 avx = _mm256_set1_ps(bd->ang_vel_x[c]);
  const __m256 avy = _mm256_set1_ps(bd->ang_vel_y[c]);
  const __m256 avz = _mm256_set1_ps(bd->ang_vel_z[c]);

  const __m256 csx = _mm256_set1_ps(sim->cfgs[c].centrifugal_inertia[0]);
  const __m256 csy = _mm256_set1_ps(sim->cfgs[c].centrifugal_inertia[1]);
  const __m256 csz = _mm256_set1_ps(sim->cfgs[c].centrifugal_inertia[2]);

  for (int i = 0; i < PARTICLES_PER_BATCH; i += 8) {
    int idx = base_idx + i;
    __m256 px = _mm256_load_ps(&st->p_pos_x[idx]);
    __m256 py = _mm256_load_ps(&st->p_pos_y[idx]);
    __m256 pz = _mm256_load_ps(&st->p_pos_z[idx]);
    __m256 im = _mm256_load_ps(&st->p_rmass[idx]);
    __m256 fix = _mm256_cmp_ps(im, zero, _CMP_EQ_OQ);
    __m256 prx = _mm256_load_ps(&st->p_pos_x_prev[idx]);
    __m256 pry = _mm256_load_ps(&st->p_pos_y_prev[idx]);
    __m256 prz = _mm256_load_ps(&st->p_pos_z_prev[idx]);

    __m256 adx = _mm256_mul_ps(_mm256_fmsub_ps(avy, pz, _mm256_mul_ps(avz, py)), dt_v);
    __m256 ady = _mm256_mul_ps(_mm256_fmsub_ps(avz, px, _mm256_mul_ps(avx, pz)), dt_v);
    __m256 adz = _mm256_mul_ps(_mm256_fmsub_ps(avx, py, _mm256_mul_ps(avy, px)), dt_v);

    __m256 cx = _mm256_fmsub_ps(avy, pz, _mm256_mul_ps(avz, py));
    __m256 cy = _mm256_fmsub_ps(avz, px, _mm256_mul_ps(avx, pz));
    __m256 cz = _mm256_fmsub_ps(avx, py, _mm256_mul_ps(avy, px));

    __m256 cdx = _mm256_mul_ps(_mm256_mul_ps(_mm256_fmsub_ps(avy, cz, _mm256_mul_ps(avz, cy)), dt_v), csx);
    __m256 cdy = _mm256_mul_ps(_mm256_mul_ps(_mm256_fmsub_ps(avz, cx, _mm256_mul_ps(avx, cz)), dt_v), csy);
    __m256 cdz = _mm256_mul_ps(_mm256_mul_ps(_mm256_fmsub_ps(avx, cy, _mm256_mul_ps(avy, cx)), dt_v), csz);

    prx = _mm256_sub_ps(prx, _mm256_add_ps(lin_dx, _mm256_add_ps(adx, cdx)));
    pry = _mm256_sub_ps(pry, _mm256_add_ps(lin_dy, _mm256_add_ps(ady, cdy)));
    prz = _mm256_sub_ps(prz, _mm256_add_ps(lin_dz, _mm256_add_ps(adz, cdz)));

    __m256 wfx = _mm256_load_ps(&st->p_wnd_frc_x[idx]);
    __m256 wfy = _mm256_load_ps(&st->p_wnd_frc_y[idx]);
    __m256 wfz = _mm256_load_ps(&st->p_wnd_frc_z[idx]);

    __m256 vx = _mm256_mul_ps(_mm256_sub_ps(px, prx), dmp_v);
    __m256 vy = _mm256_mul_ps(_mm256_sub_ps(py, pry), dmp_v);
    __m256 vz = _mm256_mul_ps(_mm256_sub_ps(pz, prz), dmp_v);

    __m256 ax = _mm256_mul_ps(_mm256_mul_ps(wfx, im), dt_sq_v);
    __m256 ay = _mm256_mul_ps(_mm256_mul_ps(wfy, im), dt_sq_v);
    __m256 az = _mm256_mul_ps(_mm256_mul_ps(wfz, im), dt_sq_v);

    __m256 npx = _mm256_add_ps(px, _mm256_add_ps(vx, _mm256_add_ps(gx, ax)));
    __m256 npy = _mm256_add_ps(py, _mm256_add_ps(vy, _mm256_add_ps(gy, ay)));
    __m256 npz = _mm256_add_ps(pz, _mm256_add_ps(vz, _mm256_add_ps(gz, az)));

    _mm256_store_ps(&st->p_pos_x[idx], _mm256_blendv_ps(npx, px, fix));
    _mm256_store_ps(&st->p_pos_y[idx], _mm256_blendv_ps(npy, py, fix));
    _mm256_store_ps(&st->p_pos_z[idx], _mm256_blendv_ps(npz, pz, fix));

    _mm256_store_ps(&st->p_pos_x_prev[idx], px);
    _mm256_store_ps(&st->p_pos_y_prev[idx], py);
    _mm256_store_ps(&st->p_pos_z_prev[idx], pz);
  }
 }
 static void
 compute_distance_constraints(struct cloth_sim* restrict sim, int c, float dt) {
  int setup_idx = sim->fab_idx[c];
  const struct cloth_fabric* restrict fab = &sim->fabs[setup_idx];
  struct cloth_instance_state* restrict st = &sim->cloths[c];

  const __m256 cmp_lim = _mm256_set1_ps(sim->cfgs[c].phases_compress_limit);
  const __m256 str_lim = _mm256_set1_ps(sim->cfgs[c].phases_stretch_limit);
  const __m256 stiff = _mm256_set1_ps(sim->cfgs[c].phases_stiffness);
  const __m256 dt_sq = _mm256_set1_ps(dt*dt);
  const __m256 zero = _mm256_setzero_ps();
  const __m256 neg_one = _mm256_set1_ps(-1.0f);
  const __m256 eps = _mm256_set1_ps(EPSILON * EPSILON);

  for (int phase = 0; phase < MAX_PHASES - 1; phase++) {
    int pid = fab->phase_idx[phase];
    int start = fab->sets[pid];
    int end = fab->sets[pid+1];
    for (int i = start; i < end; i += CONSTRAINT_BATCH_SIZE) {
      for (int j = 0; j < CONSTRAINT_BATCH_SIZE; j += 8) {
        int idx = i + j;
        __m256i i0 = _mm256_loadu_si256((__m256i*)&fab->idx0[idx]);
        __m256i i1 = _mm256_loadu_si256((__m256i*)&fab->idx1[idx]);

        __m256 px0 = _mm256_i32gather_ps(st->p_pos_x, i0, 4);
        __m256 py0 = _mm256_i32gather_ps(st->p_pos_y, i0, 4);
        __m256 pz0 = _mm256_i32gather_ps(st->p_pos_z, i0, 4);

        __m256 px1 = _mm256_i32gather_ps(st->p_pos_x, i1, 4);
        __m256 py1 = _mm256_i32gather_ps(st->p_pos_y, i1, 4);
        __m256 pz1 = _mm256_i32gather_ps(st->p_pos_z, i1, 4);

        __m256 w0 = _mm256_i32gather_ps(st->p_rmass, i0, 4);
        __m256 w1 = _mm256_i32gather_ps(st->p_rmass, i1, 4);

        __m256 rest = _mm256_loadu_ps(&fab->rest_vals[idx]);
        __m256 k_val = _mm256_loadu_ps(&fab->stiff_vals[idx]);
        k_val = _mm256_blendv_ps(stiff, k_val, _mm256_cmp_ps(k_val, zero, _CMP_NEQ_OQ));

        __m256 dx = _mm256_sub_ps(px1, px0);
        __m256 dy = _mm256_sub_ps(py1, py0);
        __m256 dz = _mm256_sub_ps(pz1, pz0);
        __m256 d2 = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
        __m256 safe_d2 = _mm256_max_ps(d2, eps);
        __m256 inv_l = mm256_rsqrt_nr_ps(safe_d2);
        __m256 len = _mm256_mul_ps(safe_d2, inv_l);

        __m256 delta = _mm256_sub_ps(len, rest);
        __m256 d_clamped = _mm256_blendv_ps(_mm256_blendv_ps(delta, _mm256_mul_ps(delta, cmp_lim), _mm256_cmp_ps(delta, zero, _CMP_LT_OQ)), _mm256_mul_ps(delta, str_lim), _mm256_cmp_ps(delta, zero, _CMP_GT_OQ));

        __m256 alpha = _mm256_div_ps(_mm256_rcp_ps(k_val), dt_sq);
        __m256 factor = _mm256_mul_ps(neg_one, _mm256_div_ps(d_clamped, _mm256_add_ps(_mm256_add_ps(w0, w1), alpha)));
        __m256 cx = _mm256_mul_ps(dx, _mm256_mul_ps(factor, inv_l));
        __m256 cy = _mm256_mul_ps(dy, _mm256_mul_ps(factor, inv_l));
        __m256 cz = _mm256_mul_ps(dz, _mm256_mul_ps(factor, inv_l));

        float bx[8], by[8], bz[8], bw0[8], bw1[8];
        _mm256_store_ps(bx, cx);
        _mm256_store_ps(by, cy);
        _mm256_store_ps(bz, cz);
        _mm256_store_ps(bw0, w0);
        _mm256_store_ps(bw1, w1);

        int bi0[8], bi1[8];
        _mm256_store_si256((__m256i*)bi0, i0);
        _mm256_store_si256((__m256i*)bi1, i1);

        for (int k=0; k<8; ++k) {
          st->p_pos_x[bi0[k]] -= bw0[k]*bx[k];
          st->p_pos_y[bi0[k]] -= bw0[k]*by[k];
          st->p_pos_z[bi0[k]] -= bw0[k]*bz[k];

          st->p_pos_x[bi1[k]] += bw1[k]*bx[k];
          st->p_pos_y[bi1[k]] += bw1[k]*by[k];
          st->p_pos_z[bi1[k]] += bw1[k]*bz[k];
        }
      }
    }
  }
 }
 static void
 compute_tether_constraints(struct cloth_sim* restrict sim, int c, float dt) {
  int setup_idx = sim->fab_idx[c];
  const struct cloth_fabric* restrict fab = &sim->fabs[setup_idx];
  struct cloth_instance_state* restrict st = &sim->cloths[c];

  const __m256 dt_sq = _mm256_set1_ps(dt * dt);
  const __m256 zero = _mm256_setzero_ps();
  const __m256 eps = _mm256_set1_ps(EPSILON * EPSILON);
  const __m256 stiffness = _mm256_set1_ps(sim->cfgs[c].tether_stiffness);
  const __m256 len_scale = _mm256_set1_ps(sim->cfgs[c].tether_length_scale);
  const __m256 alpha = _mm256_div_ps(_mm256_rcp_ps(stiffness), dt_sq);

  for (int i = 0; i < MAX_TETHERS; i += 8) {
    __m256i p_idx = _mm256_loadu_si256((__m256i*)&fab->particle_idx[i]);
    __m256i a_idx = _mm256_loadu_si256((__m256i*)&fab->anchors[i]);

    __m256 px = _mm256_i32gather_ps(st->p_pos_x, p_idx, 4);
    __m256 py = _mm256_i32gather_ps(st->p_pos_y, p_idx, 4);
    __m256 pz = _mm256_i32gather_ps(st->p_pos_z, p_idx, 4);
    __m256 w0 = _mm256_i32gather_ps(st->p_rmass, p_idx, 4);
    __m256 ax = _mm256_i32gather_ps(st->p_pos_x, a_idx, 4);
    __m256 ay = _mm256_i32gather_ps(st->p_pos_y, a_idx, 4);
    __m256 az = _mm256_i32gather_ps(st->p_pos_z, a_idx, 4);
    __m256 w1 = _mm256_i32gather_ps(st->p_rmass, a_idx, 4);

    __m256 rest = _mm256_mul_ps(_mm256_loadu_ps(&fab->tether_lens[i]), len_scale);
    __m256 dx = _mm256_sub_ps(px, ax);
    __m256 dy = _mm256_sub_ps(py, ay);
    __m256 dz = _mm256_sub_ps(pz, az);
    __m256 d2 = _mm256_fmadd_ps(dz, dz, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
    __m256 safe_d2 = _mm256_max_ps(d2, eps);
    __m256 inv_l = mm256_rsqrt_nr_ps(safe_d2);
    __m256 len = _mm256_mul_ps(safe_d2, inv_l);

    __m256 delta = _mm256_sub_ps(len, rest);
    // Unilateral: Only correct if stretched (len > rest)
    __m256 mask = _mm256_cmp_ps(delta, zero, _CMP_GT_OQ);
    __m256 w_sum = _mm256_add_ps(_mm256_add_ps(w0, w1), alpha);
    __m256 factor = _mm256_div_ps(_mm256_mul_ps(delta, _mm256_set1_ps(-1.0f)), w_sum);
    factor = _mm256_and_ps(factor, mask);

    __m256 cx = _mm256_mul_ps(dx, _mm256_mul_ps(factor, inv_l));
    __m256 cy = _mm256_mul_ps(dy, _mm256_mul_ps(factor, inv_l));
    __m256 cz = _mm256_mul_ps(dz, _mm256_mul_ps(factor, inv_l));

    float bx[8], by[8], bz[8], bw0[8], bw1[8]; 
    _mm256_store_ps(bx, cx);
    _mm256_store_ps(by, cy);
    _mm256_store_ps(bz, cz);
    _mm256_store_ps(bw0, w0);
    _mm256_store_ps(bw1, w1);

    int bi_p[8], bi_a[8];
    _mm256_store_si256((__m256i*)bi_p, p_idx);
    _mm256_store_si256((__m256i*)bi_a, a_idx);
    
    for(int k=0; k<8; ++k) {
      st->p_pos_x[bi_p[k]] -= bw0[k]*bx[k];
      st->p_pos_y[bi_p[k]] -= bw0[k]*by[k];
      st->p_pos_z[bi_p[k]] -= bw0[k]*bz[k];

      st->p_pos_x[bi_a[k]] += bw1[k]*bx[k];
      st->p_pos_y[bi_a[k]] += bw1[k]*by[k];
      st->p_pos_z[bi_a[k]] += bw1[k]*bz[k];
    }
  }
 }
 static void
 compute_collisions_and_finalize(struct cloth_sim* restrict sim, int c, int base_idx, float dt) {
  struct cloth_instance_state* restrict st = &sim->cloths[c];
  const __m256 inv_dt = _mm256_set1_ps((dt > EPSILON) ? 1.0f/dt : 0.0f);
  const __m256 dt_v = _mm256_set1_ps(dt);
  const __m256 zero = _mm256_setzero_ps();
  const __m256 eps = _mm256_set1_ps(EPSILON*EPSILON);
  const __m256 friction = _mm256_set1_ps(0.1f);

  for (int i = 0; i < PARTICLES_PER_BATCH; i += 8) {
    int idx = base_idx + i;
    __m256 px = _mm256_load_ps(&st->p_pos_x[idx]);
    __m256 py = _mm256_load_ps(&st->p_pos_y[idx]);
    __m256 pz = _mm256_load_ps(&st->p_pos_z[idx]);
    __m256 npx = px, npy = py, npz = pz;
    __m256 contact = zero;
    for (int s = 0; s < MAX_SPHERES; s++) {
      int sidx = c * MAX_SPHERES + s;
      __m256 sx = _mm256_set1_ps(sim->sphere_pos_x[sidx]);
      __m256 sy = _mm256_set1_ps(sim->sphere_pos_y[sidx]);
      __m256 sz = _mm256_set1_ps(sim->sphere_pos_z[sidx]);
      __m256 r = _mm256_set1_ps(sim->sphere_radius[sidx]);

      __m256 dx = _mm256_sub_ps(npx, sx);
      __m256 dy = _mm256_sub_ps(npy, sy);
      __m256 dz = _mm256_sub_ps(npz, sz);

      __m256 d2 = _mm256_fmadd_ps(dz, dz,_mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dx, dx)));
      __m256 safe_d2 = _mm256_max_ps(d2, eps);
      __m256 inv_d = mm256_rsqrt_nr_ps(safe_d2);
      __m256 pen = _mm256_sub_ps(_mm256_mul_ps(safe_d2, inv_d), r);
      __m256 mask = _mm256_cmp_ps(pen, zero, _CMP_LT_OQ);
      contact = _mm256_or_ps(contact, mask);
      __m256 push = _mm256_mul_ps(pen, inv_d);

      npx = _mm256_sub_ps(npx, _mm256_and_ps(_mm256_mul_ps(dx, push), mask));
      npy = _mm256_sub_ps(npy, _mm256_and_ps(_mm256_mul_ps(dy, push), mask));
      npz = _mm256_sub_ps(npz, _mm256_and_ps(_mm256_mul_ps(dz, push), mask));
    }
    __m256 prx = _mm256_load_ps(&st->p_pos_x_prev[idx]);
    __m256 pry = _mm256_load_ps(&st->p_pos_y_prev[idx]);
    __m256 prz = _mm256_load_ps(&st->p_pos_z_prev[idx]);
    __m256 vx = _mm256_mul_ps(_mm256_sub_ps(npx, prx), inv_dt);
    __m256 vy = _mm256_mul_ps(_mm256_sub_ps(npy, pry), inv_dt);
    __m256 vz = _mm256_mul_ps(_mm256_sub_ps(npz, prz), inv_dt);
    __m256 scale = _mm256_blendv_ps(_mm256_set1_ps(1.0f), _mm256_sub_ps(_mm256_set1_ps(1.0f), friction), contact);
    
    vx = _mm256_mul_ps(vx, scale);
    vy = _mm256_mul_ps(vy, scale);
    vz = _mm256_mul_ps(vz, scale);

    prx = _mm256_sub_ps(npx, _mm256_mul_ps(vx, dt_v));
    pry = _mm256_sub_ps(npy, _mm256_mul_ps(vy, dt_v));
    prz = _mm256_sub_ps(npz, _mm256_mul_ps(vz, dt_v));

    __m256 fix = _mm256_cmp_ps(_mm256_load_ps(&st->p_rmass[idx]), zero, _CMP_EQ_OQ);
    npx = _mm256_blendv_ps(npx, px, fix);
    npy = _mm256_blendv_ps(npy, py, fix);
    npz = _mm256_blendv_ps(npz, pz, fix);

    _mm256_store_ps(&st->p_pos_x[idx], npx)
    _mm256_store_ps(&st->p_pos_y[idx], npy);
    _mm256_store_ps(&st->p_pos_z[idx], npz);

    _mm256_store_ps(&st->p_pos_x_prev[idx], prx);
    _mm256_store_ps(&st->p_pos_y_prev[idx], pry);
    _mm256_store_ps(&st->p_pos_z_prev[idx], prz);
  }
 }
 // -----------------------------------------------------------------------------
 // Public API
 // -----------------------------------------------------------------------------
 extern void
 cloth_pre_batch_update(struct cloth_sim* restrict sim, float dt) {
  SIM_ASSERT(sim != NULL);
  SIM_ASSERT(dt > EPSILON);
  for (int c = 0; c < MAX_CLOTHS; c += 8) {
    compute_batch_setup(sim, c, dt);
  }
 }
 extern void
 cloth_solve_range(struct cloth_sim* restrict sim, float dt, int start_cloth, int end_cloth) {
  SIM_ASSERT(sim != NULL);
  SIM_ASSERT(dt > EPSILON);
  SIM_ASSERT(start_cloth >= 0 && end_cloth <= MAX_CLOTHS);
  for (int c = start_cloth; c < end_cloth; c++) {
    SIM_VALIDATE(sim, c);
    compute_wind_forces(sim, c, dt);
    for (int i = 0; i < MAX_PARTICLES; i += PARTICLES_PER_BATCH) {
      integrate_batch(sim, c, i, dt);
    } 
    const float sub_dt = dt / SUBSTEPS;
    for (int s = 0; s < SUBSTEPS; s++) {
      compute_distance_constraints(sim, c, sub_dt); 
      compute_tether_constraints(sim, c, sub_dt);
      for (int i = 0; i < MAX_PARTICLES; i += PARTICLES_PER_BATCH) {
        compute_collisions_and_finalize(sim, c, i, sub_dt);
      }
    }
  }
 }
No results found