chain simulation
/*
* ======================================================================================
* XPBD CHAIN SIMULATION LIBRARY
* ======================================================================================
* OVERVIEW:
* Physics simulation designed specifically for high-fidelity decorative chains
* in AAA games (jewelry, hair braids, lantern holders, armor attachments).
* It prioritizes visual stability, artistic control, and extreme CPU efficiency.
*
* ALGORITHM & ARCHITECTURE:
* - Core Solver: Extended Position Based Dynamics (XPBD). Ensures infinite stiffness
* stability and frame-rate independence via compliance (alpha / dt^2).
* - Memory Layout: "Transposed Structure-of-Arrays". Particles are stored in interleaved
* Even/Odd arrays, allowing 8-wide AVX operations on a single chain without scatter/gather.
* - Pipeline: Hybrid "Blocked" execution. Global forces are computed 8-chains-wide (Parallel),
* while constraint solving is performed per-chain (Sequential) but instruction-interleaved
* to maximize register utilization and hide ALU latency.
* - Precision: Inverse square roots use hardware rsqrt approximations with an
* epsilon clamp on squared lengths to prevent instability in angle constraints at high tension.
*
* KEY FEATURES:
* 1. Art-Directable Curvature:
* True Cosine-based angle constraints allow chains to hold a natural curl
* (e.g., gold necklaces, stiff cables) via `rest_curvature`.
* 2. Root Motion Blending:
* Configurable inertia blend prevents the "floating/lagging" artifact common in
* character accessories. Chains can inherit parent acceleration seamlessly.
* 3. Robust Collision & Friction:
* - Sphere-based collision with "Velocity Kill" friction logic.
* - Prevents "jitter" when chains rest on surfaces or get pinched.
* - Handles spawn-inside-collider edge cases gracefully.
* 4. Environmental Interaction:
* - World-space Wind and Gravity support.
* - Hard Motion Radius constraints to prevent clipping through macro-geometry.
*
* PERFORMANCE CHARACTERISTICS:
* - Zero dynamic allocation (Stack/Static memory only).
* - Optimized for L1 Cache residency.
* - Inner solver loop fits entirely within 16 YMM registers (Zero stack spilling).
* - Throughput: Capable of simulating hundreds of chains in < 0.5ms on modern CPUs.
*
* CONSTRAINTS & LIMITATIONS:
* - Hardware: Requires CPU support for AVX2 and FMA3 (Intel Haswell+, AMD Zen+).
* - Topology: Strictly linear chains (no branching, no cloth grids).
* - Capacity: Fixed compile-time limits (128 chains, 16 particles/chain).
* (Optimized for 16-particle chains; fewer is possible but wastes SIMD lanes).
*
* USAGE:
* 1. Call chn_sim_init() once.
* 2. Add chains via chn_sim_add().
* 3. Update root transforms/colliders per frame via chn_sim_mov/chn_sim_set_sph.
* 4. Call chn_sim_run() with a fixed or variable delta time.
*
* ======================================================================================
*/
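/*
 * Example build line (illustrative only; the file name and raylib linkage are
 * assumptions about how this gist is compiled, not part of the source):
 *   cc -O2 -mavx2 -mfma chain_sim.c -o chain_demo -lraylib -lm
 * MSVC builds need /arch:AVX2 instead of -mavx2 -mfma.
 */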
#include <immintrin.h>
#include <math.h>
#include <assert.h>
#include <string.h>
/* --- Compiler Feature Macros --- */
#if defined(_MSC_VER)
#define ALWAYS_INLINE __forceinline
#define RESTRICT __restrict
#define ALIGN_VAR(x) __declspec(align(x))
#define UNROLL_HINT __pragma(loop(unroll))
#elif defined(__GNUC__) || defined(__clang__)
#define ALWAYS_INLINE __attribute__((always_inline)) inline
#define RESTRICT __restrict__
#define ALIGN_VAR(x) __attribute__((aligned(x)))
#define UNROLL_HINT _Pragma("GCC unroll 2")
#else
#define ALWAYS_INLINE inline
#define RESTRICT
#define ALIGN_VAR(x)
#define UNROLL_HINT
#endif
/* --- Constants --- */
#define MAX_MOTION_RADIUS 10000.0f
enum {
MAX_CHNS = 128,
PTC_PER_CHN = 16,
SPH_PER_CHN = 8,
MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
CON_PER_CHN = PTC_PER_CHN,
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
MAX_ITR = 16,
};
struct chn_cfg {
float damping;
float stiffness;
float friction;
float linear_inertia;
float angular_inertia;
float centrifugal_inertia;
float bend_stiffness;
float rest_curvature;
float _pad[8]; // Pad struct to 64 bytes so chn_cfg[] entries stay aligned for the SoA layout
} ALIGN_VAR(32);
struct chn_sim {
unsigned char free_idx_cnt;
unsigned char free_idx[MAX_CHNS];
// World Transforms (Anchor Point)
float chn_pos_x[MAX_CHNS] ALIGN_VAR(32);
float chn_pos_y[MAX_CHNS] ALIGN_VAR(32);
float chn_pos_z[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_x[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_y[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_z[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_w[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_pos_x[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_pos_y[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_pos_z[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_x[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_y[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_z[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_w[MAX_CHNS] ALIGN_VAR(32);
float chn_grav_x[MAX_CHNS] ALIGN_VAR(32);
float chn_grav_y[MAX_CHNS] ALIGN_VAR(32);
float chn_grav_z[MAX_CHNS] ALIGN_VAR(32);
float chn_wind_x[MAX_CHNS] ALIGN_VAR(32);
float chn_wind_y[MAX_CHNS] ALIGN_VAR(32);
float chn_wind_z[MAX_CHNS] ALIGN_VAR(32);
struct chn_cfg chn_cfg[MAX_CHNS] ALIGN_VAR(32);
// Local Particles
float ptc_pos_x[MAX_PTC] ALIGN_VAR(32);
float ptc_pos_y[MAX_PTC] ALIGN_VAR(32);
float ptc_pos_z[MAX_PTC] ALIGN_VAR(32);
float ptc_prev_pos_x[MAX_PTC] ALIGN_VAR(32);
float ptc_prev_pos_y[MAX_PTC] ALIGN_VAR(32);
float ptc_prev_pos_z[MAX_PTC] ALIGN_VAR(32);
float ptc_inv_mass[MAX_PTC] ALIGN_VAR(32);
// Motion Radius (Hard Constraints)
float rest_pos_x[MAX_PTC] ALIGN_VAR(32);
float rest_pos_y[MAX_PTC] ALIGN_VAR(32);
float rest_pos_z[MAX_PTC] ALIGN_VAR(32);
float motion_radius[MAX_PTC] ALIGN_VAR(32);
// Constraints
float ptc_rest_len[MAX_REST_LEN] ALIGN_VAR(32);
float ptc_lambda[MAX_REST_LEN] ALIGN_VAR(32);
float ptc_bend_lambda[MAX_REST_LEN] ALIGN_VAR(32);
// Colliders
float sph_x[MAX_SPH] ALIGN_VAR(32);
float sph_y[MAX_SPH] ALIGN_VAR(32);
float sph_z[MAX_SPH] ALIGN_VAR(32);
float sph_r[MAX_SPH] ALIGN_VAR(32);
};
static int
chn_sim__val(struct chn_sim *cns) {
assert(cns);
assert(cns->free_idx_cnt <= MAX_CHNS);
return 1;
}
extern void
chn_sim_init(struct chn_sim *cns) {
assert(cns);
memset(cns, 0, sizeof(*cns));
cns->free_idx_cnt = MAX_CHNS;
for (int i = 0; i < MAX_CHNS; ++i) {
cns->free_idx[i] = MAX_CHNS - i - 1;
cns->chn_quat_w[i] = 1.0f;
cns->chn_prev_quat_w[i] = 1.0f;
}
for (int i = 0; i < MAX_PTC; ++i) {
cns->motion_radius[i] = MAX_MOTION_RADIUS;
}
}
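/*
 * chn_sim_add:
 * `rest_pos` is read as `cnt` packed float4 entries [x, y, z, inv_mass] in logical
 * chain order (the demo pins index 0 as the root by giving it inv_mass 0).
 * Particles are stored transposed: logical even indices land in physical lanes 0..7,
 * logical odd indices in lanes 8..15, so one chain maps onto two 8-wide AVX registers.
 */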
extern int
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *RESTRICT rest_pos, int cnt) {
assert(cns && cfg && rest_pos);
assert(cns->free_idx_cnt > 0);
int chn = cns->free_idx[--cns->free_idx_cnt];
cns->chn_cfg[chn] = *cfg;
int p_base = (chn * PTC_PER_CHN);
int r_base = (chn * CON_PER_CHN);
for (int i = 0; i < PTC_PER_CHN; ++i) {
int logical_i = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
int physical_i = p_base + i;
if (logical_i < cnt) {
cns->ptc_pos_x[physical_i] = rest_pos[logical_i*4+0];
cns->ptc_pos_y[physical_i] = rest_pos[logical_i*4+1];
cns->ptc_pos_z[physical_i] = rest_pos[logical_i*4+2];
cns->ptc_inv_mass[physical_i] = rest_pos[logical_i*4+3];
cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
} else {
cns->ptc_pos_x[physical_i] = 0.0f;
cns->ptc_pos_y[physical_i] = 0.0f;
cns->ptc_pos_z[physical_i] = 0.0f;
cns->ptc_inv_mass[physical_i] = 0.0f;
cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
}
cns->rest_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
cns->rest_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
cns->rest_pos_z[physical_i] = cns->ptc_pos_z[physical_i];
cns->ptc_prev_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
cns->ptc_prev_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
cns->ptc_prev_pos_z[physical_i] = cns->ptc_pos_z[physical_i];
}
for (int i = 0; i < PTC_PER_CHN; ++i) {
int physical_r = r_base + i;
cns->ptc_rest_len[physical_r] = 0.0f;
cns->ptc_lambda[physical_r] = 0.0f;
cns->ptc_bend_lambda[physical_r] = 0.0f;
int logical_con_start = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
if (logical_con_start + 1 < cnt) {
float dx = rest_pos[(logical_con_start+1)*4+0] - rest_pos[logical_con_start*4+0];
float dy = rest_pos[(logical_con_start+1)*4+1] - rest_pos[logical_con_start*4+1];
float dz = rest_pos[(logical_con_start+1)*4+2] - rest_pos[logical_con_start*4+2];
float len = sqrtf(dx*dx + dy*dy + dz*dz);
cns->ptc_rest_len[physical_r] = (len > 1e-6f) ? len : 1e-6f;
}
}
return chn;
}
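/*
 * Illustrative helper (not part of the library API; the name is made up): converts a
 * logical particle index (0..15 along the chain) into its physical slot in the
 * transposed even/odd layout. It mirrors the mapping used by chn_sim_add() above
 * and by the demo's GetParticleWorldPos() below.
 */
static ALWAYS_INLINE int
chn__logical_to_physical(int chn, int logical) {
int lane = (logical & 1) ? (8 + (logical >> 1)) : (logical >> 1);
return (chn * PTC_PER_CHN) + lane;
}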
extern void
chn_sim_del(struct chn_sim *cns, int chn) {
assert(chn >= 0 && chn < MAX_CHNS);
int p_base = (chn * PTC_PER_CHN);
int r_base = (chn * CON_PER_CHN);
int s_base = (chn * SPH_PER_CHN);
cns->free_idx[cns->free_idx_cnt++] = chn;
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
cns->chn_quat_x[chn] = 0.0f;
cns->chn_quat_y[chn] = 0.0f;
cns->chn_quat_z[chn] = 0.0f;
cns->chn_quat_w[chn] = 1.0f;
cns->chn_prev_quat_x[chn] = 0.0f;
cns->chn_prev_quat_y[chn] = 0.0f;
cns->chn_prev_quat_z[chn] = 0.0f;
cns->chn_prev_quat_w[chn] = 1.0f;
memset(&cns->ptc_pos_x[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_pos_y[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_pos_z[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_inv_mass[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_rest_len[r_base], 0, CON_PER_CHN * sizeof(float));
for (int i = 0; i < PTC_PER_CHN; ++i) {
cns->motion_radius[p_base + i] = MAX_MOTION_RADIUS;
}
memset(&cns->sph_r[s_base], 0, SPH_PER_CHN * sizeof(float));
}
extern void
chn_sim_mov(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *RESTRICT rot4) {
cns->chn_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = rot4[3];
}
extern void
chn_sim_tel(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *RESTRICT rot4) {
chn_sim_mov(cns, chn, pos3, rot4);
cns->chn_prev_pos_x[chn] = pos3[0];
cns->chn_prev_pos_y[chn] = pos3[1];
cns->chn_prev_pos_z[chn] = pos3[2];
cns->chn_prev_quat_x[chn] = rot4[0];
cns->chn_prev_quat_y[chn] = rot4[1];
cns->chn_prev_quat_z[chn] = rot4[2];
cns->chn_prev_quat_w[chn] = rot4[3];
}
extern void
chn_sim_grav(struct chn_sim *cns, int chn, const float *RESTRICT grav3) {
cns->chn_grav_x[chn] = grav3[0];
cns->chn_grav_y[chn] = grav3[1];
cns->chn_grav_z[chn] = grav3[2];
}
extern void
chn_sim_wind(struct chn_sim *cns, int chn, const float *RESTRICT wind3) {
cns->chn_wind_x[chn] = wind3[0];
cns->chn_wind_y[chn] = wind3[1];
cns->chn_wind_z[chn] = wind3[2];
}
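/*
 * chn_sim_set_sph:
 * `spheres` is `cnt` packed float4 entries [x, y, z, radius] in chain-local space
 * (relative to the transform passed to chn_sim_mov), as the demo below shows.
 * Unused slots get radius 0 and are masked out by the collision pass.
 */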
extern void
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *RESTRICT spheres, int cnt) {
int s_base = chn * SPH_PER_CHN;
for (int i = 0; i < cnt && i < SPH_PER_CHN; ++i) {
cns->sph_x[s_base + i] = spheres[i*4+0];
cns->sph_y[s_base + i] = spheres[i*4+1];
cns->sph_z[s_base + i] = spheres[i*4+2];
cns->sph_r[s_base + i] = spheres[i*4+3];
}
for (int i = cnt; i < SPH_PER_CHN; ++i) {
cns->sph_r[s_base + i] = 0.0f;
}
}
extern void
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *RESTRICT radius, int cnt) {
int p_base = chn * PTC_PER_CHN;
for (int i = 0; i < cnt && i < PTC_PER_CHN; ++i) {
// Map the logical chain index to its transposed slot (even -> lanes 0..7, odd -> lanes 8..15)
int physical_i = (i % 2 == 0) ? (p_base + i/2) : (p_base + 8 + i/2);
cns->motion_radius[physical_i] = radius[i];
}
}
static ALWAYS_INLINE void
quat_mul_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry,
__m256 *RESTRICT rz, __m256 *RESTRICT rw,
__m256 x1, __m256 y1, __m256 z1, __m256 w1,
__m256 x2, __m256 y2, __m256 z2, __m256 w2) {
// 1. Compute Cross Product terms (Q1.xyz x Q2.xyz)
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(y1, z2), _mm256_mul_ps(z1, y2));
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(z1, x2), _mm256_mul_ps(x1, z2));
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(x1, y2), _mm256_mul_ps(y1, x2));
// 2. Compute Dot Product term (Q1.xyz . Q2.xyz)
__m256 dot_x = _mm256_mul_ps(x1, x2);
__m256 dot_y = _mm256_mul_ps(y1, y2);
__m256 dot_z = _mm256_mul_ps(z1, z2);
__m256 dot_sum = _mm256_add_ps(dot_x, _mm256_add_ps(dot_y, dot_z));
// 3. Compute W scale terms
__m256 w1x2 = _mm256_mul_ps(w1, x2);
__m256 x1w2 = _mm256_mul_ps(x1, w2);
__m256 w1y2 = _mm256_mul_ps(w1, y2);
__m256 y1w2 = _mm256_mul_ps(y1, w2);
__m256 w1z2 = _mm256_mul_ps(w1, z2);
__m256 z1w2 = _mm256_mul_ps(z1, w2);
// 4. Final Assembly
// X = (w1*x2 + x1*w2) - cross_x
*rx = _mm256_sub_ps(_mm256_add_ps(w1x2, x1w2), cross_x);
// Y = (w1*y2 + y1*w2) - cross_y
*ry = _mm256_sub_ps(_mm256_add_ps(w1y2, y1w2), cross_y);
// Z = (w1*z2 + z1*w2) - cross_z
*rz = _mm256_sub_ps(_mm256_add_ps(w1z2, z1w2), cross_z);
// W = (w1*w2) - dot_sum
*rw = _mm256_sub_ps(_mm256_mul_ps(w1, w2), dot_sum);
}
static ALWAYS_INLINE void
rot_vec_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz,
__m256 qx, __m256 qy, __m256 qz, __m256 qw,
__m256 vx, __m256 vy, __m256 vz) {
__m256 two = _mm256_set1_ps(2.0f);
// 1. Calculate Q.xyz x V
__m256 q_cross_v_x = _mm256_sub_ps(_mm256_mul_ps(qy, vz), _mm256_mul_ps(qz, vy));
__m256 q_cross_v_y = _mm256_sub_ps(_mm256_mul_ps(qz, vx), _mm256_mul_ps(qx, vz));
__m256 q_cross_v_z = _mm256_sub_ps(_mm256_mul_ps(qx, vy), _mm256_mul_ps(qy, vx));
// 2. Calculate T = 2 * (Q.xyz x V)
__m256 tx = _mm256_mul_ps(two, q_cross_v_x);
__m256 ty = _mm256_mul_ps(two, q_cross_v_y);
__m256 tz = _mm256_mul_ps(two, q_cross_v_z);
// 3. Calculate Q.xyz x T
__m256 q_cross_t_x = _mm256_sub_ps(_mm256_mul_ps(qy, tz), _mm256_mul_ps(qz, ty));
__m256 q_cross_t_y = _mm256_sub_ps(_mm256_mul_ps(qz, tx), _mm256_mul_ps(qx, tz));
__m256 q_cross_t_z = _mm256_sub_ps(_mm256_mul_ps(qx, ty), _mm256_mul_ps(qy, tx));
// 4. Calculate Q.w * T
__m256 w_times_t_x = _mm256_mul_ps(qw, tx);
__m256 w_times_t_y = _mm256_mul_ps(qw, ty);
__m256 w_times_t_z = _mm256_mul_ps(qw, tz);
// 5. Final Result: V + (Q.w * T) + (Q.xyz x T)
*rx = _mm256_add_ps(vx, _mm256_add_ps(w_times_t_x, q_cross_t_x));
*ry = _mm256_add_ps(vy, _mm256_add_ps(w_times_t_y, q_cross_t_y));
*rz = _mm256_add_ps(vz, _mm256_add_ps(w_times_t_z, q_cross_t_z));
}
static ALWAYS_INLINE void
rot_vec_inv_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz,
__m256 qx, __m256 qy, __m256 qz, __m256 qw,
__m256 vx, __m256 vy, __m256 vz) {
// Conjugate the quaternion (-x, -y, -z, w)
__m256 neg_qx = _mm256_sub_ps(_mm256_setzero_ps(), qx);
__m256 neg_qy = _mm256_sub_ps(_mm256_setzero_ps(), qy);
__m256 neg_qz = _mm256_sub_ps(_mm256_setzero_ps(), qz);
// Perform standard rotation using the conjugate
rot_vec_batch8(rx, ry, rz, neg_qx, neg_qy, neg_qz, qw, vx, vy, vz);
}
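/*
 * XPBD distance constraint, 8 lanes at a time:
 *   C       = |p1 - p2| - rest_len
 *   dLambda = -(C + compliance * lambda) / (invm1 + invm2 + compliance)
 *   p1 += invm1 * dLambda * n,  p2 -= invm2 * dLambda * n,  n = (p1 - p2) / |p1 - p2|
 * Lanes whose rest_len is 0 (padding constraints) are masked to a zero correction.
 */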
static ALWAYS_INLINE void
solve_dist(__m256 *RESTRICT p1x, __m256 *RESTRICT p1y, __m256 *RESTRICT p1z,
__m256 invm1, __m256 *RESTRICT p2x,
__m256 *RESTRICT p2y, __m256 *RESTRICT p2z,
__m256 invm2, __m256 rest_len, __m256 compliance,
__m256 *RESTRICT lambda, __m256 eps) {
__m256 dx = _mm256_sub_ps(*p1x, *p2x);
__m256 dy = _mm256_sub_ps(*p1y, *p2y);
__m256 dz = _mm256_sub_ps(*p1z, *p2z);
__m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));
__m256 inv_d = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps));
__m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(),
_mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(d2, inv_d), rest_len),
_mm256_mul_ps(compliance, *lambda))),
_mm256_add_ps(_mm256_add_ps(invm1, invm2), compliance));
dL = _mm256_and_ps(dL, _mm256_cmp_ps(rest_len, _mm256_setzero_ps(), _CMP_GT_OQ));
*lambda = _mm256_add_ps(*lambda, dL);
__m256 px = _mm256_mul_ps(_mm256_mul_ps(dx, inv_d), dL);
__m256 py = _mm256_mul_ps(_mm256_mul_ps(dy, inv_d), dL);
__m256 pz = _mm256_mul_ps(_mm256_mul_ps(dz, inv_d), dL);
*p1x = _mm256_fmadd_ps(px, invm1, *p1x);
*p1y = _mm256_fmadd_ps(py, invm1, *p1y);
*p1z = _mm256_fmadd_ps(pz, invm1, *p1z);
*p2x = _mm256_fnmadd_ps(px, invm2, *p2x);
*p2y = _mm256_fnmadd_ps(py, invm2, *p2y);
*p2z = _mm256_fnmadd_ps(pz, invm2, *p2z);
}
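/*
 * Cosine-based bend constraint over the triple (p0, p1, p2):
 *   n1 = normalize(p1 - p0),  n2 = normalize(p2 - p1),  C = dot(n1, n2) - target_cos
 * Gradients are taken w.r.t. the normalized edge directions; `mask` zeroes the
 * correction on lanes produced by the wrap-around shuffle.
 */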
static ALWAYS_INLINE void
solve_angle(__m256 *RESTRICT p0x, __m256 *RESTRICT p0y, __m256 *RESTRICT p0z,
__m256 invm0, __m256 *RESTRICT p1x, __m256 *RESTRICT p1y,
__m256 *RESTRICT p1z, __m256 invm1, __m256 *RESTRICT p2x,
__m256 *RESTRICT p2y, __m256 *RESTRICT p2z, __m256 invm2,
__m256 compliance, __m256 *RESTRICT lambda, __m256 target_cos,
__m256 mask, __m256 eps) {
__m256 e1x = _mm256_sub_ps(*p1x, *p0x);
__m256 e1y = _mm256_sub_ps(*p1y, *p0y);
__m256 e1z = _mm256_sub_ps(*p1z, *p0z);
__m256 l1_sq = _mm256_fmadd_ps(e1x, e1x, _mm256_fmadd_ps(e1y, e1y, _mm256_mul_ps(e1z, e1z)));
__m256 i1 = _mm256_rsqrt_ps(_mm256_max_ps(l1_sq, eps));
__m256 e2x = _mm256_sub_ps(*p2x, *p1x);
__m256 e2y = _mm256_sub_ps(*p2y, *p1y);
__m256 e2z = _mm256_sub_ps(*p2z, *p1z);
__m256 l2_sq = _mm256_fmadd_ps(e2x, e2x, _mm256_fmadd_ps(e2y, e2y, _mm256_mul_ps(e2z, e2z)));
__m256 i2 = _mm256_rsqrt_ps(_mm256_max_ps(l2_sq, eps));
__m256 n1x = _mm256_mul_ps(e1x, i1);
__m256 n1y = _mm256_mul_ps(e1y, i1);
__m256 n1z = _mm256_mul_ps(e1z, i1);
__m256 n2x = _mm256_mul_ps(e2x, i2);
__m256 n2y = _mm256_mul_ps(e2y, i2);
__m256 n2z = _mm256_mul_ps(e2z, i2);
__m256 dot = _mm256_fmadd_ps(n1x, n2x, _mm256_fmadd_ps(n1y, n2y, _mm256_mul_ps(n1z, n2z)));
__m256 C = _mm256_sub_ps(dot, target_cos);
__m256 gp0x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1x, n2x), i1);
__m256 gp0y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1y, n2y), i1);
__m256 gp0z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1z, n2z), i1);
__m256 gp2x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2x, n1x), _mm256_sub_ps(_mm256_setzero_ps(), i2));
__m256 gp2y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2y, n1y), _mm256_sub_ps(_mm256_setzero_ps(), i2));
__m256 gp2z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2z, n1z), _mm256_sub_ps(_mm256_setzero_ps(), i2));
__m256 gp1x = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0x, gp2x));
__m256 gp1y = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0y, gp2y));
__m256 gp1z = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0z, gp2z));
__m256 w = _mm256_add_ps(_mm256_mul_ps(invm0, _mm256_fmadd_ps(gp0x, gp0x, _mm256_fmadd_ps(gp0y, gp0y, _mm256_mul_ps(gp0z, gp0z)))),
_mm256_add_ps(_mm256_mul_ps(invm1, _mm256_fmadd_ps(gp1x, gp1x, _mm256_fmadd_ps(gp1y, gp1y, _mm256_mul_ps(gp1z, gp1z)))),
_mm256_mul_ps(invm2, _mm256_fmadd_ps(gp2x, gp2x, _mm256_fmadd_ps(gp2y, gp2y, _mm256_mul_ps(gp2z, gp2z))))));
__m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(C, _mm256_mul_ps(compliance, *lambda))), _mm256_add_ps(w, compliance));
dL = _mm256_and_ps(dL, mask); // CRITICAL: Mask out the delta lambda for the wrap-around lane
*lambda = _mm256_add_ps(*lambda, dL);
*p0x = _mm256_fmadd_ps(gp0x, _mm256_mul_ps(dL, invm0), *p0x);
*p0y = _mm256_fmadd_ps(gp0y, _mm256_mul_ps(dL, invm0), *p0y);
*p0z = _mm256_fmadd_ps(gp0z, _mm256_mul_ps(dL, invm0), *p0z);
*p1x = _mm256_fmadd_ps(gp1x, _mm256_mul_ps(dL, invm1), *p1x);
*p1y = _mm256_fmadd_ps(gp1y, _mm256_mul_ps(dL, invm1), *p1y);
*p1z = _mm256_fmadd_ps(gp1z, _mm256_mul_ps(dL, invm1), *p1z);
*p2x = _mm256_fmadd_ps(gp2x, _mm256_mul_ps(dL, invm2), *p2x);
*p2y = _mm256_fmadd_ps(gp2y, _mm256_mul_ps(dL, invm2), *p2y);
*p2z = _mm256_fmadd_ps(gp2z, _mm256_mul_ps(dL, invm2), *p2z);
}
/* --- Simulation Loop --- */
extern void
chn_sim_run(struct chn_sim *RESTRICT cns, float dt) {
const __m256 dt2_v = _mm256_set1_ps(dt * dt);
const __m256 eps_v = _mm256_set1_ps(1e-7f);
const __m256 zero_v = _mm256_setzero_ps();
const __m256i sl = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
const __m256i sr = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
ALIGN_VAR(32) float b_fx[8], b_fy[8], b_fz[8], b_vx[8], b_vy[8], b_vz[8], b_qrx[8], b_qry[8], b_qrz[8], b_qrw[8];
for (int c_blk = 0; c_blk < MAX_CHNS; c_blk += 8) {
/* --- PHASE 1: Batched Anchor Updates --- */
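/*
 * Eight chains per block: gravity + wind and the root's frame displacement are
 * rotated into each chain's local frame (rot_vec_inv_batch8), and the rotation
 * delta quaternion conj(current) * previous is built for the angular/centrifugal
 * inertia terms. Lane results are spilled to small stack buffers so Phase 2 can
 * broadcast one chain per iteration.
 */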
{
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c_blk]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c_blk]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c_blk]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c_blk]);
__m256 pqx = _mm256_load_ps(&cns->chn_prev_quat_x[c_blk]);
__m256 pqy = _mm256_load_ps(&cns->chn_prev_quat_y[c_blk]);
__m256 pqz = _mm256_load_ps(&cns->chn_prev_quat_z[c_blk]);
__m256 pqw = _mm256_load_ps(&cns->chn_prev_quat_w[c_blk]);
__m256 cx = _mm256_load_ps(&cns->chn_pos_x[c_blk]);
__m256 cy = _mm256_load_ps(&cns->chn_pos_y[c_blk]);
__m256 cz = _mm256_load_ps(&cns->chn_pos_z[c_blk]);
__m256 px = _mm256_load_ps(&cns->chn_prev_pos_x[c_blk]);
__m256 py = _mm256_load_ps(&cns->chn_prev_pos_y[c_blk]);
__m256 pz = _mm256_load_ps(&cns->chn_prev_pos_z[c_blk]);
__m256 wfx, wfy, wfz;
rot_vec_inv_batch8(&wfx, &wfy, &wfz, qx, qy, qz, qw, _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_x[c_blk]), _mm256_load_ps(&cns->chn_wind_x[c_blk])), _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_y[c_blk]), _mm256_load_ps(&cns->chn_wind_y[c_blk])), _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_z[c_blk]), _mm256_load_ps(&cns->chn_wind_z[c_blk])));
_mm256_store_ps(b_fx, wfx);
_mm256_store_ps(b_fy, wfy);
_mm256_store_ps(b_fz, wfz);
__m256 lvx, lvy, lvz; rot_vec_inv_batch8(&lvx, &lvy, &lvz, qx, qy, qz, qw, _mm256_sub_ps(cx, px), _mm256_sub_ps(cy, py), _mm256_sub_ps(cz, pz));
_mm256_store_ps(b_vx, lvx);
_mm256_store_ps(b_vy, lvy);
_mm256_store_ps(b_vz, lvz);
__m256 qrx, qry, qrz, qrw;
quat_mul_batch8(&qrx, &qry, &qrz, &qrw, _mm256_sub_ps(zero_v, qx), _mm256_sub_ps(zero_v, qy), _mm256_sub_ps(zero_v, qz), qw, pqx, pqy, pqz, pqw);
_mm256_store_ps(b_qrx, qrx);
_mm256_store_ps(b_qry, qry);
_mm256_store_ps(b_qrz, qrz);
_mm256_store_ps(b_qrw, qrw);
_mm256_store_ps(&cns->chn_prev_pos_x[c_blk], cx);
_mm256_store_ps(&cns->chn_prev_pos_y[c_blk], cy);
_mm256_store_ps(&cns->chn_prev_pos_z[c_blk], cz);
_mm256_store_ps(&cns->chn_prev_quat_x[c_blk], qx);
_mm256_store_ps(&cns->chn_prev_quat_y[c_blk], qy);
_mm256_store_ps(&cns->chn_prev_quat_z[c_blk], qz);
_mm256_store_ps(&cns->chn_prev_quat_w[c_blk], qw);
}
/* --- PHASE 2: Local Chain Simulation --- */
for (int i = 0; i < 8; ++i) {
int c = c_blk + i;
int p_base = c * PTC_PER_CHN, r_base = c * CON_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 damp = _mm256_set1_ps(cfg->damping);
__m256 compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->stiffness), eps_v));
__m256 b_compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->bend_stiffness), eps_v));
__m256 t_cos = _mm256_set1_ps(1.0f - (cfg->rest_curvature * 0.5f));
__m256 qrx = _mm256_set1_ps(b_qrx[i]);
__m256 qry = _mm256_set1_ps(b_qry[i]);
__m256 qrz = _mm256_set1_ps(b_qrz[i]);
__m256 qrw = _mm256_set1_ps(b_qrw[i]);
__m256 o_sq = _mm256_add_ps(_mm256_mul_ps(qrx, qrx), _mm256_add_ps(_mm256_mul_ps(qry, qry), _mm256_mul_ps(qrz, qrz)));
__m256 lv_x = _mm256_set1_ps(b_vx[i]);
__m256 lv_y = _mm256_set1_ps(b_vy[i]);
__m256 lv_z = _mm256_set1_ps(b_vz[i]);
__m256 i_l = _mm256_set1_ps(cfg->linear_inertia);
__m256 i_a = _mm256_set1_ps(cfg->angular_inertia);
__m256 i_c = _mm256_set1_ps(cfg->centrifugal_inertia);
__m256 p_pos[3][2], p_prev[3][2];
__m256 inv_m[2], rl[2], lm[2], blm[2], r_pos[3][2], m_rad[2];
UNROLL_HINT
for (int k=0; k<2; ++k) {
int off = p_base + k*8;
int r_off = r_base + k*8;
p_pos[0][k] = _mm256_load_ps(&cns->ptc_pos_x[off]);
p_pos[1][k] = _mm256_load_ps(&cns->ptc_pos_y[off]);
p_pos[2][k] = _mm256_load_ps(&cns->ptc_pos_z[off]);
p_prev[0][k] = _mm256_load_ps(&cns->ptc_prev_pos_x[off]);
p_prev[1][k] = _mm256_load_ps(&cns->ptc_prev_pos_y[off]);
p_prev[2][k] = _mm256_load_ps(&cns->ptc_prev_pos_z[off]);
inv_m[k] = _mm256_load_ps(&cns->ptc_inv_mass[off]);
rl[k] = _mm256_load_ps(&cns->ptc_rest_len[r_off]);
lm[k] = _mm256_load_ps(&cns->ptc_lambda[r_off]);
blm[k] = _mm256_load_ps(&cns->ptc_bend_lambda[r_off]);
r_pos[0][k] = _mm256_load_ps(&cns->rest_pos_x[off]);
r_pos[1][k] = _mm256_load_ps(&cns->rest_pos_y[off]);
r_pos[2][k] = _mm256_load_ps(&cns->rest_pos_z[off]);
m_rad[k] = _mm256_load_ps(&cns->motion_radius[off]);
}
/* --- Predict + Inertia --- */
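/*
 * Verlet-style prediction: velocity is last step's displacement, minus the blended
 * root inertia terms (linear, angular, centrifugal; gated to dynamic particles via
 * inv_mass > 0), then pos += vel * damping + force * inv_mass * dt^2.
 */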
UNROLL_HINT
for (int k=0; k<2; ++k) {
__m256 dyn = _mm256_cmp_ps(inv_m[k], zero_v, _CMP_GT_OQ);
__m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
__m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
__m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);
vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(lv_x, i_l), dyn));
vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(lv_y, i_l), dyn));
vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(lv_z, i_l), dyn));
__m256 rax, ray, raz;
rot_vec_batch8(&rax, &ray, &raz, qrx, qry, qrz, qrw, p_pos[0][k], p_pos[1][k], p_pos[2][k]);
vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[0][k], rax), i_a), dyn));
vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[1][k], ray), i_a), dyn));
vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[2][k], raz), i_a), dyn));
vx = _mm256_add_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[0][k], o_sq), i_c), dyn));
vy = _mm256_add_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[1][k], o_sq), i_c), dyn));
vz = _mm256_add_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[2][k], o_sq), i_c), dyn));
p_prev[0][k] = p_pos[0][k];
p_prev[1][k] = p_pos[1][k];
p_prev[2][k] = p_pos[2][k];
__m256 fx = _mm256_set1_ps(b_fx[i]);
__m256 fy = _mm256_set1_ps(b_fy[i]);
__m256 fz = _mm256_set1_ps(b_fz[i]);
p_pos[0][k] = _mm256_fmadd_ps(_mm256_mul_ps(fx, inv_m[k]), dt2_v, _mm256_fmadd_ps(vx, damp, p_pos[0][k]));
p_pos[1][k] = _mm256_fmadd_ps(_mm256_mul_ps(fy, inv_m[k]), dt2_v, _mm256_fmadd_ps(vy, damp, p_pos[1][k]));
p_pos[2][k] = _mm256_fmadd_ps(_mm256_mul_ps(fz, inv_m[k]), dt2_v, _mm256_fmadd_ps(vz, damp, p_pos[2][k]));
}
/* --- XPBD Solver Iterations --- */
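/*
 * Distance constraints are solved in two passes per iteration:
 *   1) Even(i) <-> Odd(i)      covers logical pairs (2i, 2i+1)
 *   2) Odd(i)  <-> Even(i+1)   covers logical pairs (2i+1, 2i+2), using a lane
 *      rotate-left before the solve and a rotate-right to scatter corrections back.
 * The two bending groups then apply the same shuffle trick so every interior
 * particle is constrained against both of its neighbours.
 */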
for (int it = 0; it < MAX_ITR; ++it) {
{
// Even(i) <-> Odd(i)
__m256 p2x = p_pos[0][1];
__m256 p2y = p_pos[1][1];
__m256 p2z = p_pos[2][1];
__m256 p2m = inv_m[1];
solve_dist(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
&p2x, &p2y, &p2z, p2m, rl[0], compl, &lm[0], eps_v);
// Direct write-back for k=0
p_pos[0][1] = p2x;
p_pos[1][1] = p2y;
p_pos[2][1] = p2z;
}
{
// Odd(i) <-> Shifted Even(i+1)
__m256 p2x = _mm256_permutevar8x32_ps(p_pos[0][0], sl);
__m256 p2y = _mm256_permutevar8x32_ps(p_pos[1][0], sl);
__m256 p2z = _mm256_permutevar8x32_ps(p_pos[2][0], sl);
__m256 p2m = _mm256_permutevar8x32_ps(inv_m[0], sl);
__m256 o2x = p2x;
__m256 o2y = p2y;
__m256 o2z = p2z;
solve_dist(&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1],
&p2x, &p2y, &p2z, p2m, rl[1], compl, &lm[1], eps_v);
// Shifted accumulation for k=1
p_pos[0][0] = _mm256_add_ps(p_pos[0][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2x, o2x), sr));
p_pos[1][0] = _mm256_add_ps(p_pos[1][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2y, o2y), sr));
p_pos[2][0] = _mm256_add_ps(p_pos[2][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2z, o2z), sr));
}
// Bending (Group 1: Odd Center)
{
__m256 v_b = _mm256_and_ps(_mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[1], zero_v, _CMP_GT_OQ));
__m256 rx = _mm256_permutevar8x32_ps(p_pos[0][0], sl);
__m256 ry = _mm256_permutevar8x32_ps(p_pos[1][0], sl);
__m256 rz = _mm256_permutevar8x32_ps(p_pos[2][0], sl);
__m256 rm = _mm256_permutevar8x32_ps(inv_m[0], sl);
solve_angle(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], &rx, &ry, &rz, rm,
b_compl, &blm[1], t_cos, v_b, eps_v);
p_pos[0][0] = _mm256_blendv_ps(p_pos[0][0], _mm256_permutevar8x32_ps(rx, sr), v_b);
p_pos[1][0] = _mm256_blendv_ps(p_pos[1][0], _mm256_permutevar8x32_ps(ry, sr), v_b);
p_pos[2][0] = _mm256_blendv_ps(p_pos[2][0], _mm256_permutevar8x32_ps(rz, sr), v_b);
}
// Bending (Group 2: Even Center)
{
__m256 v_b = _mm256_and_ps(_mm256_cmp_ps(_mm256_permutevar8x32_ps(rl[1], sr), zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ));
__m256 lx = _mm256_permutevar8x32_ps(p_pos[0][1], sr);
__m256 ly = _mm256_permutevar8x32_ps(p_pos[1][1], sr);
__m256 lz = _mm256_permutevar8x32_ps(p_pos[2][1], sr);
__m256 lmm = _mm256_permutevar8x32_ps(inv_m[1], sr);
solve_angle(&lx, &ly, &lz, lmm,
&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1],
b_compl, &blm[0], t_cos, v_b, eps_v);
p_pos[0][1] = _mm256_blendv_ps(p_pos[0][1], _mm256_permutevar8x32_ps(lx, sl), v_b);
p_pos[1][1] = _mm256_blendv_ps(p_pos[1][1], _mm256_permutevar8x32_ps(ly, sl), v_b);
p_pos[2][1] = _mm256_blendv_ps(p_pos[2][1], _mm256_permutevar8x32_ps(lz, sl), v_b);
}
}
/* --- Collisions --- */
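/*
 * Sphere colliders live in chain-local space (see chn_sim_set_sph). Penetrating
 * particles are pushed out along the sphere normal, then the "velocity kill"
 * friction removes tangential displacement (this step's pos - prev) scaled by
 * penetration depth * friction, on colliding lanes only.
 */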
int s_base = c * SPH_PER_CHN;
const __m256 f_coef = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPH_PER_CHN; ++s) {
__m256 srad = _mm256_broadcast_ss(&cns->sph_r[s_base + s]); // renamed to avoid shadowing the shuffle constant 'sr'
__m256 sx = _mm256_broadcast_ss(&cns->sph_x[s_base + s]);
__m256 sy = _mm256_broadcast_ss(&cns->sph_y[s_base + s]);
__m256 sz = _mm256_broadcast_ss(&cns->sph_z[s_base + s]);
UNROLL_HINT
for (int k = 0; k < 2; ++k) {
__m256 dx = _mm256_sub_ps(p_pos[0][k], sx);
__m256 dy = _mm256_sub_ps(p_pos[1][k], sy);
__m256 dz = _mm256_sub_ps(p_pos[2][k], sz);
__m256 d2 = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz)));
// 1. Calculate safe distance
__m256 id = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps_v));
__m256 dist = _mm256_mul_ps(d2, id);
// 2. Only calculate push if there is actual penetration and the sphere radius > 0
__m256 active_sph = _mm256_cmp_ps(srad, eps_v, _CMP_GT_OQ);
__m256 colliding = _mm256_cmp_ps(dist, srad, _CMP_LT_OQ);
__m256 mask = _mm256_and_ps(active_sph, colliding);
__m256 pen = _mm256_sub_ps(srad, dist);
__m256 push = _mm256_and_ps(pen, mask);
// 3. Normal direction (inverse length is epsilon-clamped; non-colliding lanes contribute nothing since push is masked)
__m256 nx = _mm256_mul_ps(dx, id);
__m256 ny = _mm256_mul_ps(dy, id);
__m256 nz = _mm256_mul_ps(dz, id);
p_pos[0][k] = _mm256_add_ps(p_pos[0][k], _mm256_mul_ps(nx, push));
p_pos[1][k] = _mm256_add_ps(p_pos[1][k], _mm256_mul_ps(ny, push));
p_pos[2][k] = _mm256_add_ps(p_pos[2][k], _mm256_mul_ps(nz, push));
// 4. Friction (Stable Step-by-Step)
__m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
__m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
__m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);
__m256 dot = _mm256_add_ps(_mm256_mul_ps(vx, nx), _mm256_add_ps(_mm256_mul_ps(vy, ny), _mm256_mul_ps(vz, nz)));
// Tangent Velocity = Total Velocity - Normal Velocity
__m256 tx = _mm256_sub_ps(vx, _mm256_mul_ps(dot, nx));
__m256 ty = _mm256_sub_ps(vy, _mm256_mul_ps(dot, ny));
__m256 tz = _mm256_sub_ps(vz, _mm256_mul_ps(dot, nz));
__m256 f_scale = _mm256_mul_ps(push, f_coef);
// Apply friction only where colliding
p_pos[0][k] = _mm256_sub_ps(p_pos[0][k], _mm256_and_ps(_mm256_mul_ps(tx, f_scale), mask));
p_pos[1][k] = _mm256_sub_ps(p_pos[1][k], _mm256_and_ps(_mm256_mul_ps(ty, f_scale), mask));
p_pos[2][k] = _mm256_sub_ps(p_pos[2][k], _mm256_and_ps(_mm256_mul_ps(tz, f_scale), mask));
}
}
/* --- Motion Radius Guardrail --- */
UNROLL_HINT
for (int k=0; k<2; ++k) {
__m256 dx = _mm256_sub_ps(p_pos[0][k], r_pos[0][k]);
__m256 dy = _mm256_sub_ps(p_pos[1][k], r_pos[1][k]);
__m256 dz = _mm256_sub_ps(p_pos[2][k], r_pos[2][k]);
__m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));
__m256 dist = _mm256_sqrt_ps(d2);
__m256 mask = _mm256_cmp_ps(dist, m_rad[k], _CMP_GT_OQ);
__m256 scale = _mm256_div_ps(m_rad[k], _mm256_max_ps(dist, eps_v));
p_pos[0][k] = _mm256_blendv_ps(p_pos[0][k], _mm256_fmadd_ps(dx, scale, r_pos[0][k]), mask);
p_pos[1][k] = _mm256_blendv_ps(p_pos[1][k], _mm256_fmadd_ps(dy, scale, r_pos[1][k]), mask);
p_pos[2][k] = _mm256_blendv_ps(p_pos[2][k], _mm256_fmadd_ps(dz, scale, r_pos[2][k]), mask);
}
/* --- Final Store --- */
UNROLL_HINT
for (int k=0; k<2; ++k) {
int off = p_base + k*8, r_off = r_base + k*8;
_mm256_store_ps(&cns->ptc_pos_x[off], p_pos[0][k]);
_mm256_store_ps(&cns->ptc_pos_y[off], p_pos[1][k]);
_mm256_store_ps(&cns->ptc_pos_z[off], p_pos[2][k]);
_mm256_store_ps(&cns->ptc_prev_pos_x[off], p_prev[0][k]);
_mm256_store_ps(&cns->ptc_prev_pos_y[off], p_prev[1][k]);
_mm256_store_ps(&cns->ptc_prev_pos_z[off], p_prev[2][k]);
_mm256_store_ps(&cns->ptc_lambda[r_off], lm[k]);
_mm256_store_ps(&cns->ptc_bend_lambda[r_off], blm[k]);
}
}
}
}
#include "raylib.h"
#include "raymath.h"
#include <stdio.h>
#include <math.h>
static Vector3
GetParticleWorldPos(struct chn_sim* sim, int chain_id, int particle_idx) {
int p_base = chain_id * PTC_PER_CHN;
int phys_idx = (particle_idx % 2 == 0) ?
(p_base + particle_idx/2) :
(p_base + 8 + particle_idx/2);
float lx = sim->ptc_pos_x[phys_idx];
float ly = sim->ptc_pos_y[phys_idx];
float lz = sim->ptc_pos_z[phys_idx];
// Manual Vector3 x Quaternion rotation (v' = v + 2 * cross(q.xyz, cross(q.xyz, v) + q.w * v))
float qx = sim->chn_quat_x[chain_id];
float qy = sim->chn_quat_y[chain_id];
float qz = sim->chn_quat_z[chain_id];
float qw = sim->chn_quat_w[chain_id];
float tx = 2.0f * (qy * lz - qz * ly);
float ty = 2.0f * (qz * lx - qx * lz);
float tz = 2.0f * (qx * ly - qy * lx);
float rx = lx + qw * tx + (qy * tz - qz * ty);
float ry = ly + qw * ty + (qz * tx - qx * tz);
float rz = lz + qw * tz + (qx * ty - qy * tx);
return (Vector3){
sim->chn_pos_x[chain_id] + rx,
sim->chn_pos_y[chain_id] + ry,
sim->chn_pos_z[chain_id] + rz
};
}
extern int
main(void) {
const int screenWidth = 1280;
const int screenHeight = 720;
InitWindow(screenWidth, screenHeight, "Enshrouded Chain Physics - Diagnostic");
SetTargetFPS(60);
Camera3D camera = { 0 };
camera.position = (Vector3){ 0.0f, 2.0f, 5.0f };
camera.target = (Vector3){ 0.0f, 0.5f, 0.0f };
camera.up = (Vector3){ 0.0f, 1.0f, 0.0f };
camera.fovy = 45.0f;
camera.projection = CAMERA_PERSPECTIVE;
struct chn_sim sim;
chn_sim_init(&sim);
struct chn_cfg cfg = {
.damping = 0.99f,
.stiffness = 0.5f,
.bend_stiffness = 10.0f,
.friction = 0.5f,
.linear_inertia = 0.1f,
.rest_curvature = 0.0f
};
// Create chain
float local_rest_pos[16 * 4];
for (int i = 0; i < 16; ++i) {
local_rest_pos[i*4 + 0] = 0.0f;
local_rest_pos[i*4 + 1] = -(0.15f * i); // Offset downwards from root
local_rest_pos[i*4 + 2] = 0.0f;
local_rest_pos[i*4 + 3] = (i==0) ? 0.0f : 1.0f; // Root fixed at 0,0,0 local
}
chn_sim_add(&sim, &cfg, local_rest_pos, 16);
float grav[] = { 0.0f, -9.81f, 0.0f };
chn_sim_grav(&sim, 0, grav);
// Set initial world location
float p_init[] = { 0.0f, 2.0f, 0.0f }; // Start at height 2.0
float r_init[] = { 0, 0, 0, 1 };
chn_sim_tel(&sim, 0, p_init, r_init);
Vector3 rootPos = {0.0f, 2.0f, 0.0f};
Vector3 spherePos = { 0.0f, 0.8f, 0.2f };
float sphereRadius = 0.7f;
while (!WindowShouldClose()) {
if (IsKeyPressed(KEY_Q)) {
break;
}
float dt = 0.016f; // Fixed DT for stable debugging
if (IsKeyDown(KEY_UP)) {
rootPos.z -= 4.0f * dt;
}
if (IsKeyDown(KEY_DOWN)) {
rootPos.z += 4.0f * dt;
}
if (IsKeyDown(KEY_LEFT)) {
rootPos.x -= 4.0f * dt;
}
if (IsKeyDown(KEY_RIGHT)) {
rootPos.x += 4.0f * dt;
}
// Move Sphere
if (IsKeyDown(KEY_W)) {
spherePos.z -= 2.0f * dt;
}
if (IsKeyDown(KEY_S)) {
spherePos.z += 2.0f * dt;
}
if (IsKeyDown(KEY_A)) {
spherePos.x -= 2.0f * dt;
}
if (IsKeyDown(KEY_D)){
spherePos.x += 2.0f * dt;
}
UpdateCamera(&camera, CAMERA_ORBITAL);
// 1. Move the World Space Root and pass movement to the simulator
rootPos.x += sinf(GetTime() * 2.0f) * 0.005f;
float p[] = { rootPos.x, rootPos.y, rootPos.z };
float r[] = { 0, 0, 0, 1 }; // Identity rotation for now
chn_sim_mov(&sim, 0, p, r);
// 2. Update colliders (MUST be in local space relative to rootPos)
float local_sph[] = {
spherePos.x - rootPos.x,
spherePos.y - rootPos.y,
spherePos.z - rootPos.z,
sphereRadius
};
chn_sim_set_sph(&sim, 0, local_sph, 1);
chn_sim_run(&sim, dt);
BeginDrawing();
ClearBackground(DARKGRAY);
BeginMode3D(camera);
DrawGrid(10, 1.0f);
DrawSphere(spherePos, sphereRadius, Fade(RED, 0.3f));
for(int i=0; i<15; i++) {
Vector3 p1 = GetParticleWorldPos(&sim, 0, i);
Vector3 p2 = GetParticleWorldPos(&sim, 0, i+1);
DrawCapsule(p1, p2, 0.02f, 4, 4, GOLD);
}
EndMode3D();
DrawFPS(10, 10);
EndDrawing();
}
CloseWindow();
return 0;
}