vurtun · December 30, 2025 22:14 · vurtun · Dec 19, 2025
diff --git a/chain.c b/chain.c
 /*
 * ======================================================================================
 *  XPBD CHAIN SIMULATION LIBRARY
 * ======================================================================================
 *  OVERVIEW:
 *  Physics simulation designed specifically for high-fidelity decorative chains
 *  in AAA games (jewelry, hair braids, lantern holders, armor attachments).
 *  It prioritizes visual stability, artistic control, and extreme CPU efficiency.
 *
 *  ALGORITHM & ARCHITECTURE:
 *  - Core Solver: Extended Position Based Dynamics (XPBD). Ensures infinite stiffness
 *    stability and frame-rate independence via compliance (alpha / dt^2).
 *  - Memory Layout: "Transposed Structure-of-Arrays". Particles are stored in interleaved
 *    Even/Odd arrays, allowing 8-wide AVX operations on a single chain without scatter/gather.
 *  - Pipeline: Hybrid "Blocked" execution. Global forces are computed 8-chains-wide (Parallel),
 *    while constraint solving is performed per-chain (Sequential) but instruction-interleaved
 *    to maximize register utilization and hide ALU latency.
 *  - Precision: Uses Newton-Raphson refinement for inverse square roots to prevent
 *    instability in angle constraints at high tension.
 *
 *  KEY FEATURES:
 *  1. Art-Directable Curvature:
 *     True Cosine-based angle constraints allow chains to hold a natural curl
 *     (e.g., gold necklaces, stiff cables) via `rest_curvature`.
 *  2. Root Motion Blending:
 *     Configurable inertia blend prevents the "floating/lagging" artifact common in
 *     character accessories. Chains can inherit parent acceleration seamlessly.
 *  3. Robust Collision & Friction:
 *     - Sphere-based collision with "Velocity Kill" friction logic.
 *     - Prevents "jitter" when chains rest on surfaces or get pinched.
 *     - Handles spawn-inside-collider edge cases gracefully.
 *  4. Environmental Interaction:
 *     - World-space Wind and Gravity support.
 *     - Hard Motion Radius constraints to prevent clipping through macro-geometry.
 *
 *  PERFORMANCE CHARACTERISTICS:
 *  - Zero dynamic allocation (Stack/Static memory only).
 *  - Optimized for L1 Cache residency.
 *  - Inner solver loop fits entirely within 16 YMM registers (Zero stack spilling).
 *  - Throughput: Capable of simulating hundreds of chains in < 0.5ms on modern CPUs.
 *
 *  CONSTRAINTS & LIMITATIONS:
 *  - Hardware: Requires CPU support for AVX2 and FMA3 (Intel Haswell+, AMD Zen+).
 *  - Topology: Strictly linear chains (no branching, no cloth grids).
 *  - Capacity: Fixed compile-time limits (128 chains, 16 particles/chain).
 *    (Optimized for 16-particle chains; fewer is possible but wastes SIMD lanes).
 *
 *  USAGE:
 *  1. Call chn_sim_init() once.
 *  2. Add chains via chn_sim_add().
 *  3. Update root transforms/colliders per frame via chn_sim_mov/chn_sim_set_sph.
 *  4. Call chn_sim_run() with a fixed or variable delta time.
 *
 * ======================================================================================
 */
 #include <immintrin.h>
 #include <math.h>
 #include <assert.h>
 #include <string.h>

 /* --- Compiler Feature Macros --- */
 #if defined(_MSC_VER)
  #define ALWAYS_INLINE __forceinline
  #define RESTRICT __restrict
  #define ALIGN_VAR(x) __declspec(align(x))
  #define UNROLL_HINT __pragma(loop(unroll))
 #elif defined(__GNUC__) || defined(__clang__)
  #define ALWAYS_INLINE __attribute__((always_inline)) inline
  #define RESTRICT __restrict__
  #define ALIGN_VAR(x) __attribute__((aligned(x)))
  #define UNROLL_HINT _Pragma("GCC unroll 2")
 #else
  #define ALWAYS_INLINE inline
  #define RESTRICT
  #define ALIGN_VAR(x)
  #define UNROLL_HINT
 #endif

 /* --- Constants --- */
 #define MAX_MOTION_RADIUS 10000.0f
 enum {
  MAX_CHNS      = 128,
  PTC_PER_CHN   = 16,
  SPH_PER_CHN   = 8,
  MAX_PTC       = (MAX_CHNS * PTC_PER_CHN),
  MAX_SPH       = (MAX_CHNS * SPH_PER_CHN),
  CON_PER_CHN   = PTC_PER_CHN,
  MAX_REST_LEN  = (MAX_CHNS * CON_PER_CHN),
  MAX_ITR       = 16,
 };
 struct chn_cfg {
  float damping;
  float stiffness;
  float friction;
  float linear_inertia;
  float angular_inertia;
  float centrifugal_inertia;
  float bend_stiffness;
  float rest_curvature;
  float _pad[8]; // Ensure alignment for SoA
 } ALIGN_VAR(32);

 struct chn_sim {
  unsigned char free_idx_cnt;
  unsigned char free_idx[MAX_CHNS];

  // World Transforms (Anchor Point)
  float chn_pos_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_pos_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_pos_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_w[MAX_CHNS] ALIGN_VAR(32);

  float chn_prev_pos_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_pos_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_pos_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_w[MAX_CHNS] ALIGN_VAR(32);

  float chn_grav_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_grav_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_grav_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_wind_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_wind_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_wind_z[MAX_CHNS] ALIGN_VAR(32);
  struct chn_cfg chn_cfg[MAX_CHNS] ALIGN_VAR(32);

  // Local Particles
  float ptc_pos_x[MAX_PTC] ALIGN_VAR(32);
  float ptc_pos_y[MAX_PTC] ALIGN_VAR(32);
  float ptc_pos_z[MAX_PTC] ALIGN_VAR(32);
  float ptc_prev_pos_x[MAX_PTC] ALIGN_VAR(32);
  float ptc_prev_pos_y[MAX_PTC] ALIGN_VAR(32);
  float ptc_prev_pos_z[MAX_PTC] ALIGN_VAR(32);
  float ptc_inv_mass[MAX_PTC] ALIGN_VAR(32);

  // Motion Radius (Hard Constraints)
  float rest_pos_x[MAX_PTC] ALIGN_VAR(32);
  float rest_pos_y[MAX_PTC] ALIGN_VAR(32);
  float rest_pos_z[MAX_PTC] ALIGN_VAR(32);
  float motion_radius[MAX_PTC] ALIGN_VAR(32);

  // Constraints
  float ptc_rest_len[MAX_REST_LEN] ALIGN_VAR(32);
  float ptc_lambda[MAX_REST_LEN] ALIGN_VAR(32);
  float ptc_bend_lambda[MAX_REST_LEN] ALIGN_VAR(32);

  // Colliders
  float sph_x[MAX_SPH] ALIGN_VAR(32);
  float sph_y[MAX_SPH] ALIGN_VAR(32);
  float sph_z[MAX_SPH] ALIGN_VAR(32);
  float sph_r[MAX_SPH] ALIGN_VAR(32);
 };
 static int
 chn_sim__val(struct chn_sim *cns) {
  assert(cns);
  assert(cns->free_idx_cnt <= MAX_CHNS);
  return 1;
 }
 extern void
 chn_sim_init(struct chn_sim *cns) {
  assert(cns);
  memset(cns, 0, sizeof(*cns));
  cns->free_idx_cnt = MAX_CHNS;
  for (int i = 0; i < MAX_CHNS; ++i) {
    cns->free_idx[i] = MAX_CHNS - i - 1;
    cns->chn_quat_w[i] = 1.0f;
    cns->chn_prev_quat_w[i] = 1.0f;
  }
  for (int i = 0; i < MAX_PTC; ++i) {
    cns->motion_radius[i] = MAX_MOTION_RADIUS;
  }
 }
 extern int
 chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *RESTRICT rest_pos, int cnt) {
  assert(cns && cfg && rest_pos);
  assert(cns->free_idx_cnt > 0);
  int chn = cns->free_idx[--cns->free_idx_cnt];
  cns->chn_cfg[chn] = *cfg;

  int p_base = (chn * PTC_PER_CHN);
  int r_base = (chn * CON_PER_CHN);
  for (int i = 0; i < PTC_PER_CHN; ++i) {
    int logical_i = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
    int physical_i = p_base + i;
    if (logical_i < cnt) {
      cns->ptc_pos_x[physical_i] = rest_pos[logical_i*4+0];
      cns->ptc_pos_y[physical_i] = rest_pos[logical_i*4+1];
      cns->ptc_pos_z[physical_i] = rest_pos[logical_i*4+2];
      cns->ptc_inv_mass[physical_i] = rest_pos[logical_i*4+3];
      cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
    } else {
      cns->ptc_pos_x[physical_i] = 0.0f;
      cns->ptc_pos_y[physical_i] = 0.0f;
      cns->ptc_pos_z[physical_i] = 0.0f;
      cns->ptc_inv_mass[physical_i] = 0.0f;
      cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
    }
    cns->rest_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
    cns->rest_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
    cns->rest_pos_z[physical_i] = cns->ptc_pos_z[physical_i];

    cns->ptc_prev_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
    cns->ptc_prev_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
    cns->ptc_prev_pos_z[physical_i] = cns->ptc_pos_z[physical_i];
  }
  for (int i = 0; i < PTC_PER_CHN; ++i) {
    int physical_r = r_base + i;
    cns->ptc_rest_len[physical_r] = 0.0f;
    cns->ptc_lambda[physical_r] = 0.0f;
    cns->ptc_bend_lambda[physical_r] = 0.0f;

    int logical_con_start = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
    if (logical_con_start + 1 < cnt) {
      float dx = rest_pos[(logical_con_start+1)*4+0] - rest_pos[logical_con_start*4+0];
      float dy = rest_pos[(logical_con_start+1)*4+1] - rest_pos[logical_con_start*4+1];
      float dz = rest_pos[(logical_con_start+1)*4+2] - rest_pos[logical_con_start*4+2];
      float len = sqrtf(dx*dx + dy*dy + dz*dz);
      cns->ptc_rest_len[physical_r] = (len > 1e-6f) ? len : 1e-6f;
    }
  }
  return chn;
 }
 extern void
 chn_sim_del(struct chn_sim *cns, int chn) {
  assert(chn >= 0 && chn < MAX_CHNS);
  int p_base = (chn * PTC_PER_CHN);
  int r_base = (chn * CON_PER_CHN);
  int s_base = (chn * SPH_PER_CHN);

  cns->free_idx[cns->free_idx_cnt++] = chn;
  memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));

  cns->chn_quat_x[chn] = 0.0f;
  cns->chn_quat_y[chn] = 0.0f;
  cns->chn_quat_z[chn] = 0.0f;
  cns->chn_quat_w[chn] = 1.0f;

  cns->chn_prev_quat_x[chn] = 0.0f;
  cns->chn_prev_quat_y[chn] = 0.0f;
  cns->chn_prev_quat_z[chn] = 0.0f;
  cns->chn_prev_quat_w[chn] = 1.0f;

  memset(&cns->ptc_pos_x[p_base], 0, PTC_PER_CHN * sizeof(float));
  memset(&cns->ptc_pos_y[p_base], 0, PTC_PER_CHN * sizeof(float));
  memset(&cns->ptc_pos_z[p_base], 0, PTC_PER_CHN * sizeof(float));
  memset(&cns->ptc_inv_mass[p_base], 0, PTC_PER_CHN * sizeof(float));
  memset(&cns->ptc_rest_len[r_base], 0, CON_PER_CHN * sizeof(float));
  for (int i = 0; i < PTC_PER_CHN; ++i) {
    cns->motion_radius[p_base + i] = MAX_MOTION_RADIUS;
  }
  memset(&cns->sph_r[s_base], 0, SPH_PER_CHN * sizeof(float));
 }
 extern void
 chn_sim_mov(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) {
  cns->chn_pos_x[chn] = pos3[0];
  cns->chn_pos_y[chn] = pos3[1];
  cns->chn_pos_z[chn] = pos3[2];

  cns->chn_quat_x[chn] = rot4[0];
  cns->chn_quat_y[chn] = rot4[1];
  cns->chn_quat_z[chn] = rot4[2];
  cns->chn_quat_w[chn] = rot4[3];
 }
 extern void
 chn_sim_tel(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) {
  chn_sim_mov(cns, chn, pos3, rot4);
  cns->chn_prev_pos_x[chn] = pos3[0];
  cns->chn_prev_pos_y[chn] = pos3[1];
  cns->chn_prev_pos_z[chn] = pos3[2];

  cns->chn_prev_quat_x[chn] = rot4[0];
  cns->chn_prev_quat_y[chn] = rot4[1];
  cns->chn_prev_quat_z[chn] = rot4[2];
  cns->chn_prev_quat_w[chn] = rot4[3];
 }
 extern void
 chn_sim_grav(struct chn_sim *cns, int chn, const float *RESTRICT grav3) {
  cns->chn_grav_x[chn] = grav3[0];
  cns->chn_grav_y[chn] = grav3[1];
  cns->chn_grav_z[chn] = grav3[2];
 }
 extern void
 chn_sim_wind(struct chn_sim *cns, int chn, const float *RESTRICT wind3) {
  cns->chn_wind_x[chn] = wind3[0];
  cns->chn_wind_y[chn] = wind3[1];
  cns->chn_wind_z[chn] = wind3[2];
 }
 extern void
 chn_sim_set_sph(struct chn_sim *cns, int chn, const float *RESTRICT spheres, int cnt) {
  int s_base = chn * SPH_PER_CHN;
  for (int i = 0; i < cnt && i < SPH_PER_CHN; ++i) {
    cns->sph_x[s_base + i] = spheres[i*4+0];
    cns->sph_y[s_base + i] = spheres[i*4+1];
    cns->sph_z[s_base + i] = spheres[i*4+2];
    cns->sph_r[s_base + i] = spheres[i*4+3];
  }
  for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cns->sph_r[s_base + i] = 0.0f;
  }
 }
 extern void
 chn_sim_con_motion(struct chn_sim *cns, int chn, const float *RESTRICT radius, int cnt) {
  int p_base = chn * PTC_PER_CHN;
  for (int i = 0; i < cnt && i < PTC_PER_CHN; ++i) {
    int physical_i = (i < 8) ? (p_base + i) : (p_base + 8 + (i-8));
    cns->motion_radius[physical_i] = radius[i];
  }
 }
 static ALWAYS_INLINE void
 quat_mul_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry,
                __m256 *RESTRICT rz, __m256 *RESTRICT rw, 
                __m256 x1, __m256 y1, __m256 z1, __m256 w1, 
                __m256 x2, __m256 y2, __m256 z2, __m256 w2) {
  
  // 1. Compute Cross Product terms (Q1.xyz x Q2.xyz)
  __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(y1, z2), _mm256_mul_ps(z1, y2));
  __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(z1, x2), _mm256_mul_ps(x1, z2));
  __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(x1, y2), _mm256_mul_ps(y1, x2));

  // 2. Compute Dot Product term (Q1.xyz . Q2.xyz)
  __m256 dot_x = _mm256_mul_ps(x1, x2);
  __m256 dot_y = _mm256_mul_ps(y1, y2);
  __m256 dot_z = _mm256_mul_ps(z1, z2);
  __m256 dot_sum = _mm256_add_ps(dot_x, _mm256_add_ps(dot_y, dot_z));

  // 3. Compute W scale terms
  __m256 w1x2 = _mm256_mul_ps(w1, x2);
  __m256 x1w2 = _mm256_mul_ps(x1, w2);
  
  __m256 w1y2 = _mm256_mul_ps(w1, y2);
  __m256 y1w2 = _mm256_mul_ps(y1, w2);

  __m256 w1z2 = _mm256_mul_ps(w1, z2);
  __m256 z1w2 = _mm256_mul_ps(z1, w2);

  // 4. Final Assembly
  // X = (w1*x2 + x1*w2) - cross_x
  *rx = _mm256_sub_ps(_mm256_add_ps(w1x2, x1w2), cross_x);
  // Y = (w1*y2 + y1*w2) - cross_y
  *ry = _mm256_sub_ps(_mm256_add_ps(w1y2, y1w2), cross_y);
  // Z = (w1*z2 + z1*w2) - cross_z
  *rz = _mm256_sub_ps(_mm256_add_ps(w1z2, z1w2), cross_z);
  // W = (w1*w2) - dot_sum
  *rw = _mm256_sub_ps(_mm256_mul_ps(w1, w2), dot_sum);
 }
 static ALWAYS_INLINE void
 rot_vec_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz, 
               __m256 qx, __m256 qy, __m256 qz, __m256 qw, 
               __m256 vx, __m256 vy, __m256 vz) {
  
  __m256 two = _mm256_set1_ps(2.0f);

  // 1. Calculate Q.xyz x V
  __m256 q_cross_v_x = _mm256_sub_ps(_mm256_mul_ps(qy, vz), _mm256_mul_ps(qz, vy));
  __m256 q_cross_v_y = _mm256_sub_ps(_mm256_mul_ps(qz, vx), _mm256_mul_ps(qx, vz));
  __m256 q_cross_v_z = _mm256_sub_ps(_mm256_mul_ps(qx, vy), _mm256_mul_ps(qy, vx));

  // 2. Calculate T = 2 * (Q.xyz x V)
  __m256 tx = _mm256_mul_ps(two, q_cross_v_x);
  __m256 ty = _mm256_mul_ps(two, q_cross_v_y);
  __m256 tz = _mm256_mul_ps(two, q_cross_v_z);

  // 3. Calculate Q.xyz x T
  __m256 q_cross_t_x = _mm256_sub_ps(_mm256_mul_ps(qy, tz), _mm256_mul_ps(qz, ty));
  __m256 q_cross_t_y = _mm256_sub_ps(_mm256_mul_ps(qz, tx), _mm256_mul_ps(qx, tz));
  __m256 q_cross_t_z = _mm256_sub_ps(_mm256_mul_ps(qx, ty), _mm256_mul_ps(qy, tx));

  // 4. Calculate Q.w * T
  __m256 w_times_t_x = _mm256_mul_ps(qw, tx);
  __m256 w_times_t_y = _mm256_mul_ps(qw, ty);
  __m256 w_times_t_z = _mm256_mul_ps(qw, tz);

  // 5. Final Result: V + (Q.w * T) + (Q.xyz x T)
  *rx = _mm256_add_ps(vx, _mm256_add_ps(w_times_t_x, q_cross_t_x));
  *ry = _mm256_add_ps(vy, _mm256_add_ps(w_times_t_y, q_cross_t_y));
  *rz = _mm256_add_ps(vz, _mm256_add_ps(w_times_t_z, q_cross_t_z));
 }
 static ALWAYS_INLINE void
 rot_vec_inv_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz, 
                   __m256 qx, __m256 qy, __m256 qz, __m256 qw, 
                   __m256 vx, __m256 vy, __m256 vz) {

  // Conjugate the quaternion (-x, -y, -z, w)
  __m256 neg_qx = _mm256_sub_ps(_mm256_setzero_ps(), qx);
  __m256 neg_qy = _mm256_sub_ps(_mm256_setzero_ps(), qy);
  __m256 neg_qz = _mm256_sub_ps(_mm256_setzero_ps(), qz);

  // Perform standard rotation using the conjugate
  rot_vec_batch8(rx, ry, rz, neg_qx, neg_qy, neg_qz, qw, vx, vy, vz);
 }
 static ALWAYS_INLINE void
 solve_dist(__m256 *RESTRICT p1x, __m256 *RESTRICT p1y, __m256 *RESTRICT p1z,
           __m256 invm1, __m256 *RESTRICT p2x,
           __m256 *RESTRICT p2y, __m256 *RESTRICT p2z,
           __m256 invm2, __m256 rest_len, __m256 compliance,
           __m256 *RESTRICT lambda, __m256 eps) {

  __m256 dx = _mm256_sub_ps(*p1x, *p2x);
  __m256 dy = _mm256_sub_ps(*p1y, *p2y);
  __m256 dz = _mm256_sub_ps(*p1z, *p2z);
  __m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));

  __m256 inv_d = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps));
  __m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(),
    _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(d2, inv_d), rest_len),
    _mm256_mul_ps(compliance, *lambda))),
    _mm256_add_ps(_mm256_add_ps(invm1, invm2), compliance));
  dL = _mm256_and_ps(dL, _mm256_cmp_ps(rest_len, _mm256_setzero_ps(), _CMP_GT_OQ));
  *lambda = _mm256_add_ps(*lambda, dL);

  __m256 px = _mm256_mul_ps(_mm256_mul_ps(dx, inv_d), dL);
  __m256 py = _mm256_mul_ps(_mm256_mul_ps(dy, inv_d), dL);
  __m256 pz = _mm256_mul_ps(_mm256_mul_ps(dz, inv_d), dL);

  *p1x = _mm256_fmadd_ps(px, invm1, *p1x);
  *p1y = _mm256_fmadd_ps(py, invm1, *p1y);
  *p1z = _mm256_fmadd_ps(pz, invm1, *p1z);

  *p2x = _mm256_fnmadd_ps(px, invm2, *p2x);
  *p2y = _mm256_fnmadd_ps(py, invm2, *p2y);
  *p2z = _mm256_fnmadd_ps(pz, invm2, *p2z);
 }
 static ALWAYS_INLINE void
 solve_angle(__m256 RESTRICT* p0x, __m256 RESTRICT* p0y, __m256 RESTRICT *p0z,
            __m256 invm0, __m256 *RESTRICT p1x, __m256 *RESTRICT p1y,
            __m256 *RESTRICT p1z, __m256 invm1, __m256 *RESTRICT p2x,
            __m256 *RESTRICT p2y, __m256 *RESTRICT p2z, __m256 invm2,
            __m256 compliance, __m256 *RESTRICT lambda, __m256 target_cos,
            __m256 mask, __m256 eps) {

  __m256 e1x = _mm256_sub_ps(*p1x, *p0x);
  __m256 e1y = _mm256_sub_ps(*p1y, *p0y);
  __m256 e1z = _mm256_sub_ps(*p1z, *p0z);
  __m256 l1_sq = _mm256_fmadd_ps(e1x, e1x, _mm256_fmadd_ps(e1y, e1y, _mm256_mul_ps(e1z, e1z)));
  __m256 i1 = _mm256_rsqrt_ps(_mm256_max_ps(l1_sq, eps));

  __m256 e2x = _mm256_sub_ps(*p2x, *p1x);
  __m256 e2y = _mm256_sub_ps(*p2y, *p1y);
  __m256 e2z = _mm256_sub_ps(*p2z, *p1z);
  __m256 l2_sq = _mm256_fmadd_ps(e2x, e2x, _mm256_fmadd_ps(e2y, e2y, _mm256_mul_ps(e2z, e2z)));
  __m256 i2 = _mm256_rsqrt_ps(_mm256_max_ps(l2_sq, eps));

  __m256 n1x = _mm256_mul_ps(e1x, i1);
  __m256 n1y = _mm256_mul_ps(e1y, i1);
  __m256 n1z = _mm256_mul_ps(e1z, i1);

  __m256 n2x = _mm256_mul_ps(e2x, i2);
  __m256 n2y = _mm256_mul_ps(e2y, i2);
  __m256 n2z = _mm256_mul_ps(e2z, i2);

  __m256 dot = _mm256_fmadd_ps(n1x, n2x, _mm256_fmadd_ps(n1y, n2y, _mm256_mul_ps(n1z, n2z)));
  __m256 C = _mm256_sub_ps(dot, target_cos);

  __m256 gp0x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1x, n2x), i1);
  __m256 gp0y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1y, n2y), i1);
  __m256 gp0z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1z, n2z), i1);

  __m256 gp2x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2x, n1x), _mm256_sub_ps(_mm256_setzero_ps(), i2));
  __m256 gp2y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2y, n1y), _mm256_sub_ps(_mm256_setzero_ps(), i2));
  __m256 gp2z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2z, n1z), _mm256_sub_ps(_mm256_setzero_ps(), i2));

  __m256 gp1x = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0x, gp2x));
  __m256 gp1y = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0y, gp2y));
  __m256 gp1z = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0z, gp2z));

  __m256 w = _mm256_add_ps(_mm256_mul_ps(invm0, _mm256_fmadd_ps(gp0x, gp0x, _mm256_fmadd_ps(gp0y, gp0y, _mm256_mul_ps(gp0z, gp0z)))),
             _mm256_add_ps(_mm256_mul_ps(invm1, _mm256_fmadd_ps(gp1x, gp1x, _mm256_fmadd_ps(gp1y, gp1y, _mm256_mul_ps(gp1z, gp1z)))),
             _mm256_mul_ps(invm2, _mm256_fmadd_ps(gp2x, gp2x, _mm256_fmadd_ps(gp2y, gp2y, _mm256_mul_ps(gp2z, gp2z))))));
  __m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(C, _mm256_mul_ps(compliance, *lambda))), _mm256_add_ps(w, compliance));
  dL = _mm256_and_ps(dL, mask); // CRITICAL: Mask out the delta lambda for the wrap-around lane

  *lambda = _mm256_add_ps(*lambda, dL);
  *p0x = _mm256_fmadd_ps(gp0x, _mm256_mul_ps(dL, invm0), *p0x);
  *p0y = _mm256_fmadd_ps(gp0y, _mm256_mul_ps(dL, invm0), *p0y);
  *p0z = _mm256_fmadd_ps(gp0z, _mm256_mul_ps(dL, invm0), *p0z);

  *p1x = _mm256_fmadd_ps(gp1x, _mm256_mul_ps(dL, invm1), *p1x);
  *p1y = _mm256_fmadd_ps(gp1y, _mm256_mul_ps(dL, invm1), *p1y);
  *p1z = _mm256_fmadd_ps(gp1z, _mm256_mul_ps(dL, invm1), *p1z);

  *p2x = _mm256_fmadd_ps(gp2x, _mm256_mul_ps(dL, invm2), *p2x);
  *p2y = _mm256_fmadd_ps(gp2y, _mm256_mul_ps(dL, invm2), *p2y);
  *p2z = _mm256_fmadd_ps(gp2z, _mm256_mul_ps(dL, invm2), *p2z);
 }
 /* --- Simulation Loop --- */
 extern void
 chn_sim_run(struct chn_sim *RESTRICT cns, float dt) {
  const __m256 dt2_v = _mm256_set1_ps(dt * dt);
  const __m256 eps_v = _mm256_set1_ps(1e-7f);
  const __m256 zero_v = _mm256_setzero_ps();
  const __m256i sl = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
  const __m256i sr = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);

  ALIGN_VAR(32) float b_fx[8], b_fy[8], b_fz[8], b_vx[8], b_vy[8], b_vz[8], b_qrx[8], b_qry[8], b_qrz[8], b_qrw[8];
  for (int c_blk = 0; c_blk < MAX_CHNS; c_blk += 8) {
    /* --- PHASE 1: Batched Anchor Updates --- */
    {
      __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c_blk]);
      __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c_blk]);
      __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c_blk]);
      __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c_blk]);

      __m256 pqx = _mm256_load_ps(&cns->chn_prev_quat_x[c_blk]);
      __m256 pqy = _mm256_load_ps(&cns->chn_prev_quat_y[c_blk]);
      __m256 pqz = _mm256_load_ps(&cns->chn_prev_quat_z[c_blk]);
      __m256 pqw = _mm256_load_ps(&cns->chn_prev_quat_w[c_blk]);

      __m256 cx = _mm256_load_ps(&cns->chn_pos_x[c_blk]);
      __m256 cy = _mm256_load_ps(&cns->chn_pos_y[c_blk]);
      __m256 cz = _mm256_load_ps(&cns->chn_pos_z[c_blk]);

      __m256 px = _mm256_load_ps(&cns->chn_prev_pos_x[c_blk]);
      __m256 py = _mm256_load_ps(&cns->chn_prev_pos_y[c_blk]);
      __m256 pz = _mm256_load_ps(&cns->chn_prev_pos_z[c_blk]);

      __m256 wfx, wfy, wfz;
      rot_vec_inv_batch8(&wfx, &wfy, &wfz, qx, qy, qz, qw, _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_x[c_blk]), _mm256_load_ps(&cns->chn_wind_x[c_blk])), _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_y[c_blk]), _mm256_load_ps(&cns->chn_wind_y[c_blk])), _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_z[c_blk]), _mm256_load_ps(&cns->chn_wind_z[c_blk])));
      _mm256_store_ps(b_fx, wfx);
      _mm256_store_ps(b_fy, wfy);
      _mm256_store_ps(b_fz, wfz);

      __m256 lvx, lvy, lvz; rot_vec_inv_batch8(&lvx, &lvy, &lvz, qx, qy, qz, qw, _mm256_sub_ps(cx, px), _mm256_sub_ps(cy, py), _mm256_sub_ps(cz, pz));
      _mm256_store_ps(b_vx, lvx);
      _mm256_store_ps(b_vy, lvy);
      _mm256_store_ps(b_vz, lvz);

      __m256 qrx, qry, qrz, qrw;
      quat_mul_batch8(&qrx, &qry, &qrz, &qrw, _mm256_sub_ps(zero_v, qx), _mm256_sub_ps(zero_v, qy), _mm256_sub_ps(zero_v, qz), qw, pqx, pqy, pqz, pqw);

      _mm256_store_ps(b_qrx, qrx);
      _mm256_store_ps(b_qry, qry);
      _mm256_store_ps(b_qrz, qrz);
      _mm256_store_ps(b_qrw, qrw);

      _mm256_store_ps(&cns->chn_prev_pos_x[c_blk], cx);
      _mm256_store_ps(&cns->chn_prev_pos_y[c_blk], cy);
      _mm256_store_ps(&cns->chn_prev_pos_z[c_blk], cz);

      _mm256_store_ps(&cns->chn_prev_quat_x[c_blk], qx);
      _mm256_store_ps(&cns->chn_prev_quat_y[c_blk], qy);
      _mm256_store_ps(&cns->chn_prev_quat_z[c_blk], qz);
      _mm256_store_ps(&cns->chn_prev_quat_w[c_blk], qw);
    }
    /* --- PHASE 2: Local Chain Simulation --- */
    for (int i = 0; i < 8; ++i) {
      int c = c_blk + i;
      int p_base = c * PTC_PER_CHN, r_base = c * CON_PER_CHN;
      struct chn_cfg *cfg = &cns->chn_cfg[c];

      __m256 damp = _mm256_set1_ps(cfg->damping);
      __m256 compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->stiffness), eps_v));
      __m256 b_compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->bend_stiffness), eps_v));
      __m256 t_cos = _mm256_set1_ps(1.0f - (cfg->rest_curvature * 0.5f));

      __m256 qrx = _mm256_set1_ps(b_qrx[i]);
      __m256 qry = _mm256_set1_ps(b_qry[i]);
      __m256 qrz = _mm256_set1_ps(b_qrz[i]);
      __m256 qrw = _mm256_set1_ps(b_qrw[i]);
      __m256 o_sq = _mm256_add_ps(_mm256_mul_ps(qrx, qrx), _mm256_add_ps(_mm256_mul_ps(qry, qry), _mm256_mul_ps(qrz, qrz)));

      __m256 lv_x = _mm256_set1_ps(b_vx[i]);
      __m256 lv_y = _mm256_set1_ps(b_vy[i]);
      __m256 lv_z = _mm256_set1_ps(b_vz[i]);

      __m256 i_l = _mm256_set1_ps(cfg->linear_inertia);
      __m256 i_a = _mm256_set1_ps(cfg->angular_inertia);
      __m256 i_c = _mm256_set1_ps(cfg->centrifugal_inertia);

      __m256 p_pos[3][2], p_prev[3][2];
      __m256 inv_m[2], rl[2], lm[2], blm[2], r_pos[3][2], m_rad[2];

      UNROLL_HINT
      for (int k=0; k<2; ++k) {
        int off = p_base + k*8;
        int r_off = r_base + k*8;

        p_pos[0][k] = _mm256_load_ps(&cns->ptc_pos_x[off]);
        p_pos[1][k] = _mm256_load_ps(&cns->ptc_pos_y[off]);
        p_pos[2][k] = _mm256_load_ps(&cns->ptc_pos_z[off]);

        p_prev[0][k] = _mm256_load_ps(&cns->ptc_prev_pos_x[off]);
        p_prev[1][k] = _mm256_load_ps(&cns->ptc_prev_pos_y[off]);
        p_prev[2][k] = _mm256_load_ps(&cns->ptc_prev_pos_z[off]);

        inv_m[k] = _mm256_load_ps(&cns->ptc_inv_mass[off]);
        rl[k] = _mm256_load_ps(&cns->ptc_rest_len[r_off]);
        lm[k] = _mm256_load_ps(&cns->ptc_lambda[r_off]);
        blm[k] = _mm256_load_ps(&cns->ptc_bend_lambda[r_off]);

        r_pos[0][k] = _mm256_load_ps(&cns->rest_pos_x[off]);
        r_pos[1][k] = _mm256_load_ps(&cns->rest_pos_y[off]);
        r_pos[2][k] = _mm256_load_ps(&cns->rest_pos_z[off]);
        m_rad[k] = _mm256_load_ps(&cns->motion_radius[off]);
      }
      /* --- Predict + Inertia --- */
      UNROLL_HINT
      for (int k=0; k<2; ++k) {
        __m256 dyn = _mm256_cmp_ps(inv_m[k], zero_v, _CMP_GT_OQ);
        __m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
        __m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
        __m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);

        vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(lv_x, i_l), dyn));
        vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(lv_y, i_l), dyn));
        vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(lv_z, i_l), dyn));

        __m256 rax, ray, raz;
        rot_vec_batch8(&rax, &ray, &raz, qrx, qry, qrz, qrw, p_pos[0][k], p_pos[1][k], p_pos[2][k]);
        vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[0][k], rax), i_a), dyn));
        vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[1][k], ray), i_a), dyn));
        vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[2][k], raz), i_a), dyn));

        vx = _mm256_add_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[0][k], o_sq), i_c), dyn));
        vy = _mm256_add_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[1][k], o_sq), i_c), dyn));
        vz = _mm256_add_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[2][k], o_sq), i_c), dyn));

        p_prev[0][k] = p_pos[0][k];
        p_prev[1][k] = p_pos[1][k];
        p_prev[2][k] = p_pos[2][k];

        __m256 fx = _mm256_set1_ps(b_fx[i]);
        __m256 fy = _mm256_set1_ps(b_fy[i]);
        __m256 fz = _mm256_set1_ps(b_fz[i]);

        p_pos[0][k] = _mm256_fmadd_ps(_mm256_mul_ps(fx, inv_m[k]), dt2_v, _mm256_fmadd_ps(vx, damp, p_pos[0][k]));
        p_pos[1][k] = _mm256_fmadd_ps(_mm256_mul_ps(fy, inv_m[k]), dt2_v, _mm256_fmadd_ps(vy, damp, p_pos[1][k]));
        p_pos[2][k] = _mm256_fmadd_ps(_mm256_mul_ps(fz, inv_m[k]), dt2_v, _mm256_fmadd_ps(vz, damp, p_pos[2][k]));
      }
      /* --- XPBD Solver Iterations --- */
      for (int it = 0; it < MAX_ITR; ++it) {
        {
          // Even(i) <-> Odd(i)
          __m256 p2x = p_pos[0][1];
          __m256 p2y = p_pos[1][1];
          __m256 p2z = p_pos[2][1];
          __m256 p2m = inv_m[1];
          solve_dist(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
            &p2x, &p2y, &p2z, p2m, rl[0], compl, &lm[0], eps_v);

          // Direct write-back for k=0
          p_pos[0][1] = p2x;
          p_pos[1][1] = p2y;
          p_pos[2][1] = p2z;
        }
        {
          // Odd(i) <-> Shifted Even(i+1)
          __m256 p2x = _mm256_permutevar8x32_ps(p_pos[0][0], sl);
          __m256 p2y = _mm256_permutevar8x32_ps(p_pos[1][0], sl);
          __m256 p2z = _mm256_permutevar8x32_ps(p_pos[2][0], sl);
          __m256 p2m = _mm256_permutevar8x32_ps(inv_m[0], sl);

          __m256 o2x = p2x;
          __m256 o2y = p2y;
          __m256 o2z = p2z;
          solve_dist(&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1],
            &p2x, &p2y, &p2z, p2m, rl[1], compl, &lm[1], eps_v);

          // Shifted accumulation for k=1
          p_pos[0][0] = _mm256_add_ps(p_pos[0][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2x, o2x), sr));
          p_pos[1][0] = _mm256_add_ps(p_pos[1][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2y, o2y), sr));
          p_pos[2][0] = _mm256_add_ps(p_pos[2][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2z, o2z), sr));
        }
        // Bending (Group 1: Odd Center)
        {
          __m256 v_b = _mm256_and_ps(_mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[1], zero_v, _CMP_GT_OQ));
          __m256 rx = _mm256_permutevar8x32_ps(p_pos[0][0], sl);
          __m256 ry = _mm256_permutevar8x32_ps(p_pos[1][0], sl);
          __m256 rz = _mm256_permutevar8x32_ps(p_pos[2][0], sl);
          __m256 rm = _mm256_permutevar8x32_ps(inv_m[0], sl);

          solve_angle(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
            &p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], &rx, &ry, &rz, rm,
            b_compl, &blm[1], t_cos, v_b, eps_v);

          p_pos[0][0] = _mm256_blendv_ps(p_pos[0][0], _mm256_permutevar8x32_ps(rx, sr), v_b);
          p_pos[1][0] = _mm256_blendv_ps(p_pos[1][0], _mm256_permutevar8x32_ps(ry, sr), v_b);
          p_pos[2][0] = _mm256_blendv_ps(p_pos[2][0], _mm256_permutevar8x32_ps(rz, sr), v_b);
        }
        // Bending (Group 2: Even Center)
        {
          __m256 v_b = _mm256_and_ps(_mm256_cmp_ps(_mm256_permutevar8x32_ps(rl[1], sr), zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ));
          __m256 lx = _mm256_permutevar8x32_ps(p_pos[0][1], sr);
          __m256 ly = _mm256_permutevar8x32_ps(p_pos[1][1], sr);
          __m256 lz = _mm256_permutevar8x32_ps(p_pos[2][1], sr);
          __m256 lmm = _mm256_permutevar8x32_ps(inv_m[1], sr);

          solve_angle(&lx, &ly, &lz, lmm,
            &p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
            &p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1],
            b_compl, &blm[0], t_cos, v_b, eps_v);

          p_pos[0][1] = _mm256_blendv_ps(p_pos[0][1], _mm256_permutevar8x32_ps(lx, sl), v_b);
          p_pos[1][1] = _mm256_blendv_ps(p_pos[1][1], _mm256_permutevar8x32_ps(ly, sl), v_b);
          p_pos[2][1] = _mm256_blendv_ps(p_pos[2][1], _mm256_permutevar8x32_ps(lz, sl), v_b);
        }
      }
      /* --- Collisions --- */
      int s_base = c * SPH_PER_CHN;
      const __m256 f_coef = _mm256_set1_ps(cfg->friction);
      for (int s = 0; s < SPH_PER_CHN; ++s) {
        __m256 sr = _mm256_broadcast_ss(&cns->sph_r[s_base + s]);
        __m256 sx = _mm256_broadcast_ss(&cns->sph_x[s_base + s]);
        __m256 sy = _mm256_broadcast_ss(&cns->sph_y[s_base + s]);
        __m256 sz = _mm256_broadcast_ss(&cns->sph_z[s_base + s]);

        UNROLL_HINT
        for (int k = 0; k < 2; ++k) {
          __m256 dx = _mm256_sub_ps(p_pos[0][k], sx);
          __m256 dy = _mm256_sub_ps(p_pos[1][k], sy);
          __m256 dz = _mm256_sub_ps(p_pos[2][k], sz);
          __m256 d2 = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz)));

          // 1. Calculate safe distance
          __m256 id = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps_v));
          __m256 dist = _mm256_mul_ps(d2, id);

          // 2. Only calculate push if there is actual penetration and sr > 0
          __m256 active_sph = _mm256_cmp_ps(sr, eps_v, _CMP_GT_OQ);
          __m256 colliding = _mm256_cmp_ps(dist, sr, _CMP_LT_OQ);
          __m256 mask = _mm256_and_ps(active_sph, colliding);

          __m256 pen = _mm256_sub_ps(sr, dist);
          __m256 push = _mm256_and_ps(pen, mask);

          // 3. Normal vector (zeroed if not colliding to prevent NaN propagation)
          __m256 nx = _mm256_mul_ps(dx, id);
          __m256 ny = _mm256_mul_ps(dy, id);
          __m256 nz = _mm256_mul_ps(dz, id);

          p_pos[0][k] = _mm256_add_ps(p_pos[0][k], _mm256_mul_ps(nx, push));
          p_pos[1][k] = _mm256_add_ps(p_pos[1][k], _mm256_mul_ps(ny, push));
          p_pos[2][k] = _mm256_add_ps(p_pos[2][k], _mm256_mul_ps(nz, push));

          // 4. Friction (Stable Step-by-Step)
          __m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
          __m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
          __m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);
          __m256 dot = _mm256_add_ps(_mm256_mul_ps(vx, nx), _mm256_add_ps(_mm256_mul_ps(vy, ny), _mm256_mul_ps(vz, nz)));

          // Tangent Velocity = Total Velocity - Normal Velocity
          __m256 tx = _mm256_sub_ps(vx, _mm256_mul_ps(dot, nx));
          __m256 ty = _mm256_sub_ps(vy, _mm256_mul_ps(dot, ny));
          __m256 tz = _mm256_sub_ps(vz, _mm256_mul_ps(dot, nz));
          __m256 f_scale = _mm256_mul_ps(push, f_coef);

          // Apply friction only where colliding
          p_pos[0][k] = _mm256_sub_ps(p_pos[0][k], _mm256_and_ps(_mm256_mul_ps(tx, f_scale), mask));
          p_pos[1][k] = _mm256_sub_ps(p_pos[1][k], _mm256_and_ps(_mm256_mul_ps(ty, f_scale), mask));
          p_pos[2][k] = _mm256_sub_ps(p_pos[2][k], _mm256_and_ps(_mm256_mul_ps(tz, f_scale), mask));
        }
      }
      /* --- Motion Radius Guardrail --- */
      UNROLL_HINT
      for (int k=0; k<2; ++k) {
        __m256 dx = _mm256_sub_ps(p_pos[0][k], r_pos[0][k]);
        __m256 dy = _mm256_sub_ps(p_pos[1][k], r_pos[1][k]);
        __m256 dz = _mm256_sub_ps(p_pos[2][k], r_pos[2][k]);

        __m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));
        __m256 dist = _mm256_sqrt_ps(d2);
        __m256 mask = _mm256_cmp_ps(dist, m_rad[k], _CMP_GT_OQ);
        __m256 scale = _mm256_div_ps(m_rad[k], _mm256_max_ps(dist, eps_v));

        p_pos[0][k] = _mm256_blendv_ps(p_pos[0][k], _mm256_fmadd_ps(dx, scale, r_pos[0][k]), mask);
        p_pos[1][k] = _mm256_blendv_ps(p_pos[1][k], _mm256_fmadd_ps(dy, scale, r_pos[1][k]), mask);
        p_pos[2][k] = _mm256_blendv_ps(p_pos[2][k], _mm256_fmadd_ps(dz, scale, r_pos[2][k]), mask);
      }
      /* --- Final Store --- */
      UNROLL_HINT
      for (int k=0; k<2; ++k) {
        int off = p_base + k*8, r_off = r_base + k*8;
        _mm256_store_ps(&cns->ptc_pos_x[off], p_pos[0][k]);
        _mm256_store_ps(&cns->ptc_pos_y[off], p_pos[1][k]);
        _mm256_store_ps(&cns->ptc_pos_z[off], p_pos[2][k]);

        _mm256_store_ps(&cns->ptc_prev_pos_x[off], p_prev[0][k]);
        _mm256_store_ps(&cns->ptc_prev_pos_y[off], p_prev[1][k]);
        _mm256_store_ps(&cns->ptc_prev_pos_z[off], p_prev[2][k]);
        _mm256_store_ps(&cns->ptc_lambda[r_off], lm[k]);
        _mm256_store_ps(&cns->ptc_bend_lambda[r_off], blm[k]);
      }
    }
  }
 }
diff --git a/main.c b/main.c
 #include "raylib.h"
 #include "raymath.h"
 #include <stdio.h>
 #include <math.h>

 static Vector3
 GetParticleWorldPos(struct chn_sim* sim, int chain_id, int particle_idx) {
  int p_base = chain_id * PTC_PER_CHN;
  int phys_idx = (particle_idx % 2 == 0) ?
    (p_base + particle_idx/2) :
    (p_base + 8 + particle_idx/2);

  float lx = sim->ptc_pos_x[phys_idx];
  float ly = sim->ptc_pos_y[phys_idx];
  float lz = sim->ptc_pos_z[phys_idx];

  // Manual Vector3 x Quaternion rotation (v' = v + 2 * cross(q.xyz, cross(q.xyz, v) + q.w * v))
  float qx = sim->chn_quat_x[chain_id];
  float qy = sim->chn_quat_y[chain_id];
  float qz = sim->chn_quat_z[chain_id];
  float qw = sim->chn_quat_w[chain_id];

  float tx = 2.0f * (qy * lz - qz * ly);
  float ty = 2.0f * (qz * lx - qx * lz);
  float tz = 2.0f * (qx * ly - qy * lx);

  float rx = lx + qw * tx + (qy * tz - qz * ty);
  float ry = ly + qw * ty + (qz * tx - qx * tz);
  float rz = lz + qw * tz + (qx * ty - qy * tx);

  return (Vector3){
    sim->chn_pos_x[chain_id] + rx,
    sim->chn_pos_y[chain_id] + ry,
    sim->chn_pos_z[chain_id] + rz
  };
 }
 extern int
 main(void) {
  const int screenWidth = 1280;
  const int screenHeight = 720;
  InitWindow(screenWidth, screenHeight, "Enshrouded Chain Physics - Diagnostic");
  SetTargetFPS(60);

  Camera3D camera = { 0 };
  camera.position = (Vector3){ 0.0f, 2.0f, 5.0f };
  camera.target = (Vector3){ 0.0f, 0.5f, 0.0f };
  camera.up = (Vector3){ 0.0f, 1.0f, 0.0f };
  camera.fovy = 45.0f;
  camera.projection = CAMERA_PERSPECTIVE;

  struct chn_sim sim;
  chn_sim_init(&sim);

  struct chn_cfg cfg = {
    .damping = 0.99f,
    .stiffness = 0.5f,
    .bend_stiffness = 10.0f,
    .friction = 0.5f,
    .linear_inertia = 0.1f,
    .rest_curvature = 0.0f
  };
  // Create chain
  float local_rest_pos[16 * 4];
  for (int i = 0; i < 16; ++i) {
    local_rest_pos[i*4 + 0] = 0.0f;
    local_rest_pos[i*4 + 1] = -(0.15f * i); // Offset downwards from root
    local_rest_pos[i*4 + 2] = 0.0f;
    local_rest_pos[i*4 + 3] = (i==0) ? 0.0f : 1.0f; // Root fixed at 0,0,0 local
  }
  chn_sim_add(&sim, &cfg, local_rest_pos, 16);
  float grav[] = { 0.0f, -9.81f, 0.0f };
  chn_sim_grav(&sim, 0, grav);

  // Set initial world location
  float p_init[] = { 0.0f, 2.0f, 0.0f }; // Start at height 2.0
  float r_init[] = { 0, 0, 0, 1 };
  chn_sim_tel(&sim, 0, p_init, r_init);

  Vector3 rootPos = {0.0f, 2.0f, 0.0f};
  Vector3 spherePos = { 0.0f, 0.8f, 0.2f };
  float sphereRadius = 0.7f;
  while (!WindowShouldClose()) {
    if (IsKeyPressed(KEY_Q)) {
      break;
    }
    float dt = 0.016f; // Fixed DT for stable debugging
    if (IsKeyDown(KEY_UP)) {
      rootPos.z -= 4.0f * dt;
    }
    if (IsKeyDown(KEY_DOWN)) {
      rootPos.z += 4.0f * dt;
    }
    if (IsKeyDown(KEY_LEFT)) {
      rootPos.x -= 4.0f * dt;
    }
    if (IsKeyDown(KEY_RIGHT)) {
      rootPos.x += 4.0f * dt;
    }
    // Move Sphere
    if (IsKeyDown(KEY_W)) {
      spherePos.z -= 2.0f * dt;
    }
    if (IsKeyDown(KEY_S)) {
      spherePos.z += 2.0f * dt;
    }
    if (IsKeyDown(KEY_A)) {
      spherePos.x -= 2.0f * dt;
    }
    if (IsKeyDown(KEY_D)){
      spherePos.x += 2.0f * dt;
    }
    UpdateCamera(&camera, CAMERA_ORBITAL);

    // 1. Move the World Space Root and pass movement to the simulator
    rootPos.x += sinf(GetTime() * 2.0f) * 0.005f;
    float p[] = { rootPos.x, rootPos.y, rootPos.z };
    float r[] = { 0, 0, 0, 1 }; // Identity rotation for now
    chn_sim_mov(&sim, 0, p, r);

    // 3. Update Colliders (MUST be in Local Space relative to rootPos)
    float local_sph[] = {
      spherePos.x - rootPos.x,
      spherePos.y - rootPos.y,
      spherePos.z - rootPos.z,
      sphereRadius
    };
    chn_sim_set_sph(&sim, 0, local_sph, 1);
    chn_sim_run(&sim, dt);

    BeginDrawing();
    ClearBackground(DARKGRAY);
    BeginMode3D(camera);
      DrawGrid(10, 1.0f);
      DrawSphere(spherePos, sphereRadius, Fade(RED, 0.3f));
      for(int i=0; i<15; i++) {
        Vector3 p1 = GetParticleWorldPos(&sim, 0, i);
        Vector3 p2 = GetParticleWorldPos(&sim, 0, i+1);
        DrawCapsule(p1, p2, 0.02f, 4, 4, GOLD);
      }
    EndMode3D();
    DrawFPS(10, 10);
    EndDrawing();
  }
  CloseWindow();
  return 0;
 }
	#include "raylib.h"
	#include "raymath.h"
	#include <stdio.h>
	#include <math.h>

	static Vector3
	GetParticleWorldPos(struct chn_sim* sim, int chain_id, int particle_idx) {
	int p_base = chain_id * PTC_PER_CHN;
	int phys_idx = (particle_idx % 2 == 0) ?
	(p_base + particle_idx/2) :
	(p_base + 8 + particle_idx/2);

	float lx = sim->ptc_pos_x[phys_idx];
	float ly = sim->ptc_pos_y[phys_idx];
	float lz = sim->ptc_pos_z[phys_idx];

	// Manual Vector3 x Quaternion rotation (v' = v + 2 * cross(q.xyz, cross(q.xyz, v) + q.w * v))
	float qx = sim->chn_quat_x[chain_id];
	float qy = sim->chn_quat_y[chain_id];
	float qz = sim->chn_quat_z[chain_id];
	float qw = sim->chn_quat_w[chain_id];

	float tx = 2.0f * (qy * lz - qz * ly);
	float ty = 2.0f * (qz * lx - qx * lz);
	float tz = 2.0f * (qx * ly - qy * lx);

	float rx = lx + qw * tx + (qy * tz - qz * ty);
	float ry = ly + qw * ty + (qz * tx - qx * tz);
	float rz = lz + qw * tz + (qx * ty - qy * tx);

	return (Vector3){
	sim->chn_pos_x[chain_id] + rx,
	sim->chn_pos_y[chain_id] + ry,
	sim->chn_pos_z[chain_id] + rz
	};
	}
	extern int
	main(void) {
	const int screenWidth = 1280;
	const int screenHeight = 720;
	InitWindow(screenWidth, screenHeight, "Enshrouded Chain Physics - Diagnostic");
	SetTargetFPS(60);

	Camera3D camera = { 0 };
	camera.position = (Vector3){ 0.0f, 2.0f, 5.0f };
	camera.target = (Vector3){ 0.0f, 0.5f, 0.0f };
	camera.up = (Vector3){ 0.0f, 1.0f, 0.0f };
	camera.fovy = 45.0f;
	camera.projection = CAMERA_PERSPECTIVE;

	struct chn_sim sim;
	chn_sim_init(&sim);

	struct chn_cfg cfg = {
	.damping = 0.99f,
	.stiffness = 0.5f,
	.bend_stiffness = 10.0f,
	.friction = 0.5f,
	.linear_inertia = 0.1f,
	.rest_curvature = 0.0f
	};
	// Create chain
	float local_rest_pos[16 * 4];
	for (int i = 0; i < 16; ++i) {
	local_rest_pos[i*4 + 0] = 0.0f;
	local_rest_pos[i4 + 1] = -(0.15f i); // Offset downwards from root
	local_rest_pos[i*4 + 2] = 0.0f;
	local_rest_pos[i*4 + 3] = (i==0) ? 0.0f : 1.0f; // Root fixed at 0,0,0 local
	}
	chn_sim_add(&sim, &cfg, local_rest_pos, 16);
	float grav[] = { 0.0f, -9.81f, 0.0f };
	chn_sim_grav(&sim, 0, grav);

	// Set initial world location
	float p_init[] = { 0.0f, 2.0f, 0.0f }; // Start at height 2.0
	float r_init[] = { 0, 0, 0, 1 };
	chn_sim_tel(&sim, 0, p_init, r_init);

	Vector3 rootPos = {0.0f, 2.0f, 0.0f};
	Vector3 spherePos = { 0.0f, 0.8f, 0.2f };
	float sphereRadius = 0.7f;
	while (!WindowShouldClose()) {
	if (IsKeyPressed(KEY_Q)) {
	break;
	}
	float dt = 0.016f; // Fixed DT for stable debugging
	if (IsKeyDown(KEY_UP)) {
	rootPos.z -= 4.0f * dt;
	}
	if (IsKeyDown(KEY_DOWN)) {
	rootPos.z += 4.0f * dt;
	}
	if (IsKeyDown(KEY_LEFT)) {
	rootPos.x -= 4.0f * dt;
	}
	if (IsKeyDown(KEY_RIGHT)) {
	rootPos.x += 4.0f * dt;
	}
	// Move Sphere
	if (IsKeyDown(KEY_W)) {
	spherePos.z -= 2.0f * dt;
	}
	if (IsKeyDown(KEY_S)) {
	spherePos.z += 2.0f * dt;
	}
	if (IsKeyDown(KEY_A)) {
	spherePos.x -= 2.0f * dt;
	}
	if (IsKeyDown(KEY_D)){
	spherePos.x += 2.0f * dt;
	}
	UpdateCamera(&camera, CAMERA_ORBITAL);

	// 1. Move the World Space Root and pass movement to the simulator
	rootPos.x += sinf(GetTime() * 2.0f) * 0.005f;
	float p[] = { rootPos.x, rootPos.y, rootPos.z };
	float r[] = { 0, 0, 0, 1 }; // Identity rotation for now
	chn_sim_mov(&sim, 0, p, r);

	// 3. Update Colliders (MUST be in Local Space relative to rootPos)
	float local_sph[] = {
	spherePos.x - rootPos.x,
	spherePos.y - rootPos.y,
	spherePos.z - rootPos.z,
	sphereRadius
	};
	chn_sim_set_sph(&sim, 0, local_sph, 1);
	chn_sim_run(&sim, dt);

	BeginDrawing();
	ClearBackground(DARKGRAY);
	BeginMode3D(camera);
	DrawGrid(10, 1.0f);
	DrawSphere(spherePos, sphereRadius, Fade(RED, 0.3f));
	for(int i=0; i<15; i++) {
	Vector3 p1 = GetParticleWorldPos(&sim, 0, i);
	Vector3 p2 = GetParticleWorldPos(&sim, 0, i+1);
	DrawCapsule(p1, p2, 0.02f, 4, 4, GOLD);
	}
	EndMode3D();
	DrawFPS(10, 10);
	EndDrawing();
	}
	CloseWindow();
	return 0;
	}
No results found