Last active
December 30, 2025 22:14
-
-
Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
chain simulation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * ====================================================================================== | |
| * XPBD CHAIN SIMULATION LIBRARY | |
| * ====================================================================================== | |
| * OVERVIEW: | |
| * Physics simulation designed specifically for high-fidelity decorative chains | |
| * in AAA games (jewelry, hair braids, lantern holders, armor attachments). | |
| * It prioritizes visual stability, artistic control, and extreme CPU efficiency. | |
| * | |
| * ALGORITHM & ARCHITECTURE: | |
| * - Core Solver: Extended Position Based Dynamics (XPBD). Ensures infinite stiffness | |
| * stability and frame-rate independence via compliance (alpha / dt^2). | |
| * - Memory Layout: "Transposed Structure-of-Arrays". Particles are stored in interleaved | |
| * Even/Odd arrays, allowing 8-wide AVX operations on a single chain without scatter/gather. | |
| * - Pipeline: Hybrid "Blocked" execution. Global forces are computed 8-chains-wide (Parallel), | |
| * while constraint solving is performed per-chain (Sequential) but instruction-interleaved | |
| * to maximize register utilization and hide ALU latency. | |
| * - Precision: Uses Newton-Raphson refinement for inverse square roots to prevent | |
| * instability in angle constraints at high tension. | |
| * | |
| * KEY FEATURES: | |
| * 1. Art-Directable Curvature: | |
| * True Cosine-based angle constraints allow chains to hold a natural curl | |
| * (e.g., gold necklaces, stiff cables) via `rest_curvature`. | |
| * 2. Root Motion Blending: | |
| * Configurable inertia blend prevents the "floating/lagging" artifact common in | |
| * character accessories. Chains can inherit parent acceleration seamlessly. | |
| * 3. Robust Collision & Friction: | |
| * - Sphere-based collision with "Velocity Kill" friction logic. | |
| * - Prevents "jitter" when chains rest on surfaces or get pinched. | |
| * - Handles spawn-inside-collider edge cases gracefully. | |
| * 4. Environmental Interaction: | |
| * - World-space Wind and Gravity support. | |
| * - Hard Motion Radius constraints to prevent clipping through macro-geometry. | |
| * | |
| * PERFORMANCE CHARACTERISTICS: | |
| * - Zero dynamic allocation (Stack/Static memory only). | |
| * - Optimized for L1 Cache residency. | |
| * - Inner solver loop fits entirely within 16 YMM registers (Zero stack spilling). | |
| * - Throughput: Capable of simulating hundreds of chains in < 0.5ms on modern CPUs. | |
| * | |
| * CONSTRAINTS & LIMITATIONS: | |
| * - Hardware: Requires CPU support for AVX2 and FMA3 (Intel Haswell+, AMD Zen+). | |
| * - Topology: Strictly linear chains (no branching, no cloth grids). | |
| * - Capacity: Fixed compile-time limits (128 chains, 16 particles/chain). | |
| * (Optimized for 16-particle chains; fewer is possible but wastes SIMD lanes). | |
| * | |
| * USAGE: | |
| * 1. Call chn_sim_init() once. | |
| * 2. Add chains via chn_sim_add(). | |
| * 3. Update root transforms/colliders per frame via chn_sim_mov/chn_sim_set_sph. | |
| * 4. Call chn_sim_run() with a fixed or variable delta time. | |
| * | |
| * ====================================================================================== | |
| */ | |
| #include <immintrin.h> | |
| #include <math.h> | |
| #include <assert.h> | |
| #include <string.h> | |
| /* --- Compiler Feature Macros --- */ | |
| #if defined(_MSC_VER) | |
| #define ALWAYS_INLINE __forceinline | |
| #define RESTRICT __restrict | |
| #define ALIGN_VAR(x) __declspec(align(x)) | |
| #define UNROLL_HINT __pragma(loop(unroll)) | |
| #elif defined(__GNUC__) || defined(__clang__) | |
| #define ALWAYS_INLINE __attribute__((always_inline)) inline | |
| #define RESTRICT __restrict__ | |
| #define ALIGN_VAR(x) __attribute__((aligned(x))) | |
| #define UNROLL_HINT _Pragma("GCC unroll 2") | |
| #else | |
| #define ALWAYS_INLINE inline | |
| #define RESTRICT | |
| #define ALIGN_VAR(x) | |
| #define UNROLL_HINT | |
| #endif | |
| /* --- Constants --- */ | |
| #define MAX_MOTION_RADIUS 10000.0f | |
| enum { | |
| MAX_CHNS = 128, | |
| PTC_PER_CHN = 16, | |
| SPH_PER_CHN = 8, | |
| MAX_PTC = (MAX_CHNS * PTC_PER_CHN), | |
| MAX_SPH = (MAX_CHNS * SPH_PER_CHN), | |
| CON_PER_CHN = PTC_PER_CHN, | |
| MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), | |
| MAX_ITR = 16, | |
| }; | |
| struct chn_cfg { | |
| float damping; | |
| float stiffness; | |
| float friction; | |
| float linear_inertia; | |
| float angular_inertia; | |
| float centrifugal_inertia; | |
| float bend_stiffness; | |
| float rest_curvature; | |
| float _pad[8]; // Ensure alignment for SoA | |
| } ALIGN_VAR(32); | |
| struct chn_sim { | |
| unsigned char free_idx_cnt; | |
| unsigned char free_idx[MAX_CHNS]; | |
| // World Transforms (Anchor Point) | |
| float chn_pos_x[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_pos_y[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_pos_z[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_quat_x[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_quat_y[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_quat_z[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_quat_w[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_prev_pos_x[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_prev_pos_y[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_prev_pos_z[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_prev_quat_x[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_prev_quat_y[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_prev_quat_z[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_prev_quat_w[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_grav_x[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_grav_y[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_grav_z[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_wind_x[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_wind_y[MAX_CHNS] ALIGN_VAR(32); | |
| float chn_wind_z[MAX_CHNS] ALIGN_VAR(32); | |
| struct chn_cfg chn_cfg[MAX_CHNS] ALIGN_VAR(32); | |
| // Local Particles | |
| float ptc_pos_x[MAX_PTC] ALIGN_VAR(32); | |
| float ptc_pos_y[MAX_PTC] ALIGN_VAR(32); | |
| float ptc_pos_z[MAX_PTC] ALIGN_VAR(32); | |
| float ptc_prev_pos_x[MAX_PTC] ALIGN_VAR(32); | |
| float ptc_prev_pos_y[MAX_PTC] ALIGN_VAR(32); | |
| float ptc_prev_pos_z[MAX_PTC] ALIGN_VAR(32); | |
| float ptc_inv_mass[MAX_PTC] ALIGN_VAR(32); | |
| // Motion Radius (Hard Constraints) | |
| float rest_pos_x[MAX_PTC] ALIGN_VAR(32); | |
| float rest_pos_y[MAX_PTC] ALIGN_VAR(32); | |
| float rest_pos_z[MAX_PTC] ALIGN_VAR(32); | |
| float motion_radius[MAX_PTC] ALIGN_VAR(32); | |
| // Constraints | |
| float ptc_rest_len[MAX_REST_LEN] ALIGN_VAR(32); | |
| float ptc_lambda[MAX_REST_LEN] ALIGN_VAR(32); | |
| float ptc_bend_lambda[MAX_REST_LEN] ALIGN_VAR(32); | |
| // Colliders | |
| float sph_x[MAX_SPH] ALIGN_VAR(32); | |
| float sph_y[MAX_SPH] ALIGN_VAR(32); | |
| float sph_z[MAX_SPH] ALIGN_VAR(32); | |
| float sph_r[MAX_SPH] ALIGN_VAR(32); | |
| }; | |
| static int | |
| chn_sim__val(struct chn_sim *cns) { | |
| assert(cns); | |
| assert(cns->free_idx_cnt <= MAX_CHNS); | |
| return 1; | |
| } | |
| extern void | |
| chn_sim_init(struct chn_sim *cns) { | |
| assert(cns); | |
| memset(cns, 0, sizeof(*cns)); | |
| cns->free_idx_cnt = MAX_CHNS; | |
| for (int i = 0; i < MAX_CHNS; ++i) { | |
| cns->free_idx[i] = MAX_CHNS - i - 1; | |
| cns->chn_quat_w[i] = 1.0f; | |
| cns->chn_prev_quat_w[i] = 1.0f; | |
| } | |
| for (int i = 0; i < MAX_PTC; ++i) { | |
| cns->motion_radius[i] = MAX_MOTION_RADIUS; | |
| } | |
| } | |
| extern int | |
| chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *RESTRICT rest_pos, int cnt) { | |
| assert(cns && cfg && rest_pos); | |
| assert(cns->free_idx_cnt > 0); | |
| int chn = cns->free_idx[--cns->free_idx_cnt]; | |
| cns->chn_cfg[chn] = *cfg; | |
| int p_base = (chn * PTC_PER_CHN); | |
| int r_base = (chn * CON_PER_CHN); | |
| for (int i = 0; i < PTC_PER_CHN; ++i) { | |
| int logical_i = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1); | |
| int physical_i = p_base + i; | |
| if (logical_i < cnt) { | |
| cns->ptc_pos_x[physical_i] = rest_pos[logical_i*4+0]; | |
| cns->ptc_pos_y[physical_i] = rest_pos[logical_i*4+1]; | |
| cns->ptc_pos_z[physical_i] = rest_pos[logical_i*4+2]; | |
| cns->ptc_inv_mass[physical_i] = rest_pos[logical_i*4+3]; | |
| cns->motion_radius[physical_i] = MAX_MOTION_RADIUS; | |
| } else { | |
| cns->ptc_pos_x[physical_i] = 0.0f; | |
| cns->ptc_pos_y[physical_i] = 0.0f; | |
| cns->ptc_pos_z[physical_i] = 0.0f; | |
| cns->ptc_inv_mass[physical_i] = 0.0f; | |
| cns->motion_radius[physical_i] = MAX_MOTION_RADIUS; | |
| } | |
| cns->rest_pos_x[physical_i] = cns->ptc_pos_x[physical_i]; | |
| cns->rest_pos_y[physical_i] = cns->ptc_pos_y[physical_i]; | |
| cns->rest_pos_z[physical_i] = cns->ptc_pos_z[physical_i]; | |
| cns->ptc_prev_pos_x[physical_i] = cns->ptc_pos_x[physical_i]; | |
| cns->ptc_prev_pos_y[physical_i] = cns->ptc_pos_y[physical_i]; | |
| cns->ptc_prev_pos_z[physical_i] = cns->ptc_pos_z[physical_i]; | |
| } | |
| for (int i = 0; i < PTC_PER_CHN; ++i) { | |
| int physical_r = r_base + i; | |
| cns->ptc_rest_len[physical_r] = 0.0f; | |
| cns->ptc_lambda[physical_r] = 0.0f; | |
| cns->ptc_bend_lambda[physical_r] = 0.0f; | |
| int logical_con_start = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1); | |
| if (logical_con_start + 1 < cnt) { | |
| float dx = rest_pos[(logical_con_start+1)*4+0] - rest_pos[logical_con_start*4+0]; | |
| float dy = rest_pos[(logical_con_start+1)*4+1] - rest_pos[logical_con_start*4+1]; | |
| float dz = rest_pos[(logical_con_start+1)*4+2] - rest_pos[logical_con_start*4+2]; | |
| float len = sqrtf(dx*dx + dy*dy + dz*dz); | |
| cns->ptc_rest_len[physical_r] = (len > 1e-6f) ? len : 1e-6f; | |
| } | |
| } | |
| return chn; | |
| } | |
| extern void | |
| chn_sim_del(struct chn_sim *cns, int chn) { | |
| assert(chn >= 0 && chn < MAX_CHNS); | |
| int p_base = (chn * PTC_PER_CHN); | |
| int r_base = (chn * CON_PER_CHN); | |
| int s_base = (chn * SPH_PER_CHN); | |
| cns->free_idx[cns->free_idx_cnt++] = chn; | |
| memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); | |
| cns->chn_quat_x[chn] = 0.0f; | |
| cns->chn_quat_y[chn] = 0.0f; | |
| cns->chn_quat_z[chn] = 0.0f; | |
| cns->chn_quat_w[chn] = 1.0f; | |
| cns->chn_prev_quat_x[chn] = 0.0f; | |
| cns->chn_prev_quat_y[chn] = 0.0f; | |
| cns->chn_prev_quat_z[chn] = 0.0f; | |
| cns->chn_prev_quat_w[chn] = 1.0f; | |
| memset(&cns->ptc_pos_x[p_base], 0, PTC_PER_CHN * sizeof(float)); | |
| memset(&cns->ptc_pos_y[p_base], 0, PTC_PER_CHN * sizeof(float)); | |
| memset(&cns->ptc_pos_z[p_base], 0, PTC_PER_CHN * sizeof(float)); | |
| memset(&cns->ptc_inv_mass[p_base], 0, PTC_PER_CHN * sizeof(float)); | |
| memset(&cns->ptc_rest_len[r_base], 0, CON_PER_CHN * sizeof(float)); | |
| for (int i = 0; i < PTC_PER_CHN; ++i) { | |
| cns->motion_radius[p_base + i] = MAX_MOTION_RADIUS; | |
| } | |
| memset(&cns->sph_r[s_base], 0, SPH_PER_CHN * sizeof(float)); | |
| } | |
| extern void | |
| chn_sim_mov(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) { | |
| cns->chn_pos_x[chn] = pos3[0]; | |
| cns->chn_pos_y[chn] = pos3[1]; | |
| cns->chn_pos_z[chn] = pos3[2]; | |
| cns->chn_quat_x[chn] = rot4[0]; | |
| cns->chn_quat_y[chn] = rot4[1]; | |
| cns->chn_quat_z[chn] = rot4[2]; | |
| cns->chn_quat_w[chn] = rot4[3]; | |
| } | |
| extern void | |
| chn_sim_tel(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) { | |
| chn_sim_mov(cns, chn, pos3, rot4); | |
| cns->chn_prev_pos_x[chn] = pos3[0]; | |
| cns->chn_prev_pos_y[chn] = pos3[1]; | |
| cns->chn_prev_pos_z[chn] = pos3[2]; | |
| cns->chn_prev_quat_x[chn] = rot4[0]; | |
| cns->chn_prev_quat_y[chn] = rot4[1]; | |
| cns->chn_prev_quat_z[chn] = rot4[2]; | |
| cns->chn_prev_quat_w[chn] = rot4[3]; | |
| } | |
| extern void | |
| chn_sim_grav(struct chn_sim *cns, int chn, const float *RESTRICT grav3) { | |
| cns->chn_grav_x[chn] = grav3[0]; | |
| cns->chn_grav_y[chn] = grav3[1]; | |
| cns->chn_grav_z[chn] = grav3[2]; | |
| } | |
| extern void | |
| chn_sim_wind(struct chn_sim *cns, int chn, const float *RESTRICT wind3) { | |
| cns->chn_wind_x[chn] = wind3[0]; | |
| cns->chn_wind_y[chn] = wind3[1]; | |
| cns->chn_wind_z[chn] = wind3[2]; | |
| } | |
| extern void | |
| chn_sim_set_sph(struct chn_sim *cns, int chn, const float *RESTRICT spheres, int cnt) { | |
| int s_base = chn * SPH_PER_CHN; | |
| for (int i = 0; i < cnt && i < SPH_PER_CHN; ++i) { | |
| cns->sph_x[s_base + i] = spheres[i*4+0]; | |
| cns->sph_y[s_base + i] = spheres[i*4+1]; | |
| cns->sph_z[s_base + i] = spheres[i*4+2]; | |
| cns->sph_r[s_base + i] = spheres[i*4+3]; | |
| } | |
| for (int i = cnt; i < SPH_PER_CHN; ++i) { | |
| cns->sph_r[s_base + i] = 0.0f; | |
| } | |
| } | |
| extern void | |
| chn_sim_con_motion(struct chn_sim *cns, int chn, const float *RESTRICT radius, int cnt) { | |
| int p_base = chn * PTC_PER_CHN; | |
| for (int i = 0; i < cnt && i < PTC_PER_CHN; ++i) { | |
| int physical_i = (i < 8) ? (p_base + i) : (p_base + 8 + (i-8)); | |
| cns->motion_radius[physical_i] = radius[i]; | |
| } | |
| } | |
| static ALWAYS_INLINE void | |
| quat_mul_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, | |
| __m256 *RESTRICT rz, __m256 *RESTRICT rw, | |
| __m256 x1, __m256 y1, __m256 z1, __m256 w1, | |
| __m256 x2, __m256 y2, __m256 z2, __m256 w2) { | |
| // 1. Compute Cross Product terms (Q1.xyz x Q2.xyz) | |
| __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(y1, z2), _mm256_mul_ps(z1, y2)); | |
| __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(z1, x2), _mm256_mul_ps(x1, z2)); | |
| __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(x1, y2), _mm256_mul_ps(y1, x2)); | |
| // 2. Compute Dot Product term (Q1.xyz . Q2.xyz) | |
| __m256 dot_x = _mm256_mul_ps(x1, x2); | |
| __m256 dot_y = _mm256_mul_ps(y1, y2); | |
| __m256 dot_z = _mm256_mul_ps(z1, z2); | |
| __m256 dot_sum = _mm256_add_ps(dot_x, _mm256_add_ps(dot_y, dot_z)); | |
| // 3. Compute W scale terms | |
| __m256 w1x2 = _mm256_mul_ps(w1, x2); | |
| __m256 x1w2 = _mm256_mul_ps(x1, w2); | |
| __m256 w1y2 = _mm256_mul_ps(w1, y2); | |
| __m256 y1w2 = _mm256_mul_ps(y1, w2); | |
| __m256 w1z2 = _mm256_mul_ps(w1, z2); | |
| __m256 z1w2 = _mm256_mul_ps(z1, w2); | |
| // 4. Final Assembly | |
| // X = (w1*x2 + x1*w2) - cross_x | |
| *rx = _mm256_sub_ps(_mm256_add_ps(w1x2, x1w2), cross_x); | |
| // Y = (w1*y2 + y1*w2) - cross_y | |
| *ry = _mm256_sub_ps(_mm256_add_ps(w1y2, y1w2), cross_y); | |
| // Z = (w1*z2 + z1*w2) - cross_z | |
| *rz = _mm256_sub_ps(_mm256_add_ps(w1z2, z1w2), cross_z); | |
| // W = (w1*w2) - dot_sum | |
| *rw = _mm256_sub_ps(_mm256_mul_ps(w1, w2), dot_sum); | |
| } | |
| static ALWAYS_INLINE void | |
| rot_vec_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz, | |
| __m256 qx, __m256 qy, __m256 qz, __m256 qw, | |
| __m256 vx, __m256 vy, __m256 vz) { | |
| __m256 two = _mm256_set1_ps(2.0f); | |
| // 1. Calculate Q.xyz x V | |
| __m256 q_cross_v_x = _mm256_sub_ps(_mm256_mul_ps(qy, vz), _mm256_mul_ps(qz, vy)); | |
| __m256 q_cross_v_y = _mm256_sub_ps(_mm256_mul_ps(qz, vx), _mm256_mul_ps(qx, vz)); | |
| __m256 q_cross_v_z = _mm256_sub_ps(_mm256_mul_ps(qx, vy), _mm256_mul_ps(qy, vx)); | |
| // 2. Calculate T = 2 * (Q.xyz x V) | |
| __m256 tx = _mm256_mul_ps(two, q_cross_v_x); | |
| __m256 ty = _mm256_mul_ps(two, q_cross_v_y); | |
| __m256 tz = _mm256_mul_ps(two, q_cross_v_z); | |
| // 3. Calculate Q.xyz x T | |
| __m256 q_cross_t_x = _mm256_sub_ps(_mm256_mul_ps(qy, tz), _mm256_mul_ps(qz, ty)); | |
| __m256 q_cross_t_y = _mm256_sub_ps(_mm256_mul_ps(qz, tx), _mm256_mul_ps(qx, tz)); | |
| __m256 q_cross_t_z = _mm256_sub_ps(_mm256_mul_ps(qx, ty), _mm256_mul_ps(qy, tx)); | |
| // 4. Calculate Q.w * T | |
| __m256 w_times_t_x = _mm256_mul_ps(qw, tx); | |
| __m256 w_times_t_y = _mm256_mul_ps(qw, ty); | |
| __m256 w_times_t_z = _mm256_mul_ps(qw, tz); | |
| // 5. Final Result: V + (Q.w * T) + (Q.xyz x T) | |
| *rx = _mm256_add_ps(vx, _mm256_add_ps(w_times_t_x, q_cross_t_x)); | |
| *ry = _mm256_add_ps(vy, _mm256_add_ps(w_times_t_y, q_cross_t_y)); | |
| *rz = _mm256_add_ps(vz, _mm256_add_ps(w_times_t_z, q_cross_t_z)); | |
| } | |
| static ALWAYS_INLINE void | |
| rot_vec_inv_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz, | |
| __m256 qx, __m256 qy, __m256 qz, __m256 qw, | |
| __m256 vx, __m256 vy, __m256 vz) { | |
| // Conjugate the quaternion (-x, -y, -z, w) | |
| __m256 neg_qx = _mm256_sub_ps(_mm256_setzero_ps(), qx); | |
| __m256 neg_qy = _mm256_sub_ps(_mm256_setzero_ps(), qy); | |
| __m256 neg_qz = _mm256_sub_ps(_mm256_setzero_ps(), qz); | |
| // Perform standard rotation using the conjugate | |
| rot_vec_batch8(rx, ry, rz, neg_qx, neg_qy, neg_qz, qw, vx, vy, vz); | |
| } | |
| static ALWAYS_INLINE void | |
| solve_dist(__m256 *RESTRICT p1x, __m256 *RESTRICT p1y, __m256 *RESTRICT p1z, | |
| __m256 invm1, __m256 *RESTRICT p2x, | |
| __m256 *RESTRICT p2y, __m256 *RESTRICT p2z, | |
| __m256 invm2, __m256 rest_len, __m256 compliance, | |
| __m256 *RESTRICT lambda, __m256 eps) { | |
| __m256 dx = _mm256_sub_ps(*p1x, *p2x); | |
| __m256 dy = _mm256_sub_ps(*p1y, *p2y); | |
| __m256 dz = _mm256_sub_ps(*p1z, *p2z); | |
| __m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz))); | |
| __m256 inv_d = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps)); | |
| __m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(), | |
| _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(d2, inv_d), rest_len), | |
| _mm256_mul_ps(compliance, *lambda))), | |
| _mm256_add_ps(_mm256_add_ps(invm1, invm2), compliance)); | |
| dL = _mm256_and_ps(dL, _mm256_cmp_ps(rest_len, _mm256_setzero_ps(), _CMP_GT_OQ)); | |
| *lambda = _mm256_add_ps(*lambda, dL); | |
| __m256 px = _mm256_mul_ps(_mm256_mul_ps(dx, inv_d), dL); | |
| __m256 py = _mm256_mul_ps(_mm256_mul_ps(dy, inv_d), dL); | |
| __m256 pz = _mm256_mul_ps(_mm256_mul_ps(dz, inv_d), dL); | |
| *p1x = _mm256_fmadd_ps(px, invm1, *p1x); | |
| *p1y = _mm256_fmadd_ps(py, invm1, *p1y); | |
| *p1z = _mm256_fmadd_ps(pz, invm1, *p1z); | |
| *p2x = _mm256_fnmadd_ps(px, invm2, *p2x); | |
| *p2y = _mm256_fnmadd_ps(py, invm2, *p2y); | |
| *p2z = _mm256_fnmadd_ps(pz, invm2, *p2z); | |
| } | |
| static ALWAYS_INLINE void | |
| solve_angle(__m256 RESTRICT* p0x, __m256 RESTRICT* p0y, __m256 RESTRICT *p0z, | |
| __m256 invm0, __m256 *RESTRICT p1x, __m256 *RESTRICT p1y, | |
| __m256 *RESTRICT p1z, __m256 invm1, __m256 *RESTRICT p2x, | |
| __m256 *RESTRICT p2y, __m256 *RESTRICT p2z, __m256 invm2, | |
| __m256 compliance, __m256 *RESTRICT lambda, __m256 target_cos, | |
| __m256 mask, __m256 eps) { | |
| __m256 e1x = _mm256_sub_ps(*p1x, *p0x); | |
| __m256 e1y = _mm256_sub_ps(*p1y, *p0y); | |
| __m256 e1z = _mm256_sub_ps(*p1z, *p0z); | |
| __m256 l1_sq = _mm256_fmadd_ps(e1x, e1x, _mm256_fmadd_ps(e1y, e1y, _mm256_mul_ps(e1z, e1z))); | |
| __m256 i1 = _mm256_rsqrt_ps(_mm256_max_ps(l1_sq, eps)); | |
| __m256 e2x = _mm256_sub_ps(*p2x, *p1x); | |
| __m256 e2y = _mm256_sub_ps(*p2y, *p1y); | |
| __m256 e2z = _mm256_sub_ps(*p2z, *p1z); | |
| __m256 l2_sq = _mm256_fmadd_ps(e2x, e2x, _mm256_fmadd_ps(e2y, e2y, _mm256_mul_ps(e2z, e2z))); | |
| __m256 i2 = _mm256_rsqrt_ps(_mm256_max_ps(l2_sq, eps)); | |
| __m256 n1x = _mm256_mul_ps(e1x, i1); | |
| __m256 n1y = _mm256_mul_ps(e1y, i1); | |
| __m256 n1z = _mm256_mul_ps(e1z, i1); | |
| __m256 n2x = _mm256_mul_ps(e2x, i2); | |
| __m256 n2y = _mm256_mul_ps(e2y, i2); | |
| __m256 n2z = _mm256_mul_ps(e2z, i2); | |
| __m256 dot = _mm256_fmadd_ps(n1x, n2x, _mm256_fmadd_ps(n1y, n2y, _mm256_mul_ps(n1z, n2z))); | |
| __m256 C = _mm256_sub_ps(dot, target_cos); | |
| __m256 gp0x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1x, n2x), i1); | |
| __m256 gp0y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1y, n2y), i1); | |
| __m256 gp0z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1z, n2z), i1); | |
| __m256 gp2x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2x, n1x), _mm256_sub_ps(_mm256_setzero_ps(), i2)); | |
| __m256 gp2y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2y, n1y), _mm256_sub_ps(_mm256_setzero_ps(), i2)); | |
| __m256 gp2z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2z, n1z), _mm256_sub_ps(_mm256_setzero_ps(), i2)); | |
| __m256 gp1x = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0x, gp2x)); | |
| __m256 gp1y = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0y, gp2y)); | |
| __m256 gp1z = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0z, gp2z)); | |
| __m256 w = _mm256_add_ps(_mm256_mul_ps(invm0, _mm256_fmadd_ps(gp0x, gp0x, _mm256_fmadd_ps(gp0y, gp0y, _mm256_mul_ps(gp0z, gp0z)))), | |
| _mm256_add_ps(_mm256_mul_ps(invm1, _mm256_fmadd_ps(gp1x, gp1x, _mm256_fmadd_ps(gp1y, gp1y, _mm256_mul_ps(gp1z, gp1z)))), | |
| _mm256_mul_ps(invm2, _mm256_fmadd_ps(gp2x, gp2x, _mm256_fmadd_ps(gp2y, gp2y, _mm256_mul_ps(gp2z, gp2z)))))); | |
| __m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(C, _mm256_mul_ps(compliance, *lambda))), _mm256_add_ps(w, compliance)); | |
| dL = _mm256_and_ps(dL, mask); // CRITICAL: Mask out the delta lambda for the wrap-around lane | |
| *lambda = _mm256_add_ps(*lambda, dL); | |
| *p0x = _mm256_fmadd_ps(gp0x, _mm256_mul_ps(dL, invm0), *p0x); | |
| *p0y = _mm256_fmadd_ps(gp0y, _mm256_mul_ps(dL, invm0), *p0y); | |
| *p0z = _mm256_fmadd_ps(gp0z, _mm256_mul_ps(dL, invm0), *p0z); | |
| *p1x = _mm256_fmadd_ps(gp1x, _mm256_mul_ps(dL, invm1), *p1x); | |
| *p1y = _mm256_fmadd_ps(gp1y, _mm256_mul_ps(dL, invm1), *p1y); | |
| *p1z = _mm256_fmadd_ps(gp1z, _mm256_mul_ps(dL, invm1), *p1z); | |
| *p2x = _mm256_fmadd_ps(gp2x, _mm256_mul_ps(dL, invm2), *p2x); | |
| *p2y = _mm256_fmadd_ps(gp2y, _mm256_mul_ps(dL, invm2), *p2y); | |
| *p2z = _mm256_fmadd_ps(gp2z, _mm256_mul_ps(dL, invm2), *p2z); | |
| } | |
| /* --- Simulation Loop --- */ | |
| extern void | |
| chn_sim_run(struct chn_sim *RESTRICT cns, float dt) { | |
| const __m256 dt2_v = _mm256_set1_ps(dt * dt); | |
| const __m256 eps_v = _mm256_set1_ps(1e-7f); | |
| const __m256 zero_v = _mm256_setzero_ps(); | |
| const __m256i sl = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); | |
| const __m256i sr = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6); | |
| ALIGN_VAR(32) float b_fx[8], b_fy[8], b_fz[8], b_vx[8], b_vy[8], b_vz[8], b_qrx[8], b_qry[8], b_qrz[8], b_qrw[8]; | |
| for (int c_blk = 0; c_blk < MAX_CHNS; c_blk += 8) { | |
| /* --- PHASE 1: Batched Anchor Updates --- */ | |
| { | |
| __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c_blk]); | |
| __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c_blk]); | |
| __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c_blk]); | |
| __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c_blk]); | |
| __m256 pqx = _mm256_load_ps(&cns->chn_prev_quat_x[c_blk]); | |
| __m256 pqy = _mm256_load_ps(&cns->chn_prev_quat_y[c_blk]); | |
| __m256 pqz = _mm256_load_ps(&cns->chn_prev_quat_z[c_blk]); | |
| __m256 pqw = _mm256_load_ps(&cns->chn_prev_quat_w[c_blk]); | |
| __m256 cx = _mm256_load_ps(&cns->chn_pos_x[c_blk]); | |
| __m256 cy = _mm256_load_ps(&cns->chn_pos_y[c_blk]); | |
| __m256 cz = _mm256_load_ps(&cns->chn_pos_z[c_blk]); | |
| __m256 px = _mm256_load_ps(&cns->chn_prev_pos_x[c_blk]); | |
| __m256 py = _mm256_load_ps(&cns->chn_prev_pos_y[c_blk]); | |
| __m256 pz = _mm256_load_ps(&cns->chn_prev_pos_z[c_blk]); | |
| __m256 wfx, wfy, wfz; | |
| rot_vec_inv_batch8(&wfx, &wfy, &wfz, qx, qy, qz, qw, _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_x[c_blk]), _mm256_load_ps(&cns->chn_wind_x[c_blk])), _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_y[c_blk]), _mm256_load_ps(&cns->chn_wind_y[c_blk])), _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_z[c_blk]), _mm256_load_ps(&cns->chn_wind_z[c_blk]))); | |
| _mm256_store_ps(b_fx, wfx); | |
| _mm256_store_ps(b_fy, wfy); | |
| _mm256_store_ps(b_fz, wfz); | |
| __m256 lvx, lvy, lvz; rot_vec_inv_batch8(&lvx, &lvy, &lvz, qx, qy, qz, qw, _mm256_sub_ps(cx, px), _mm256_sub_ps(cy, py), _mm256_sub_ps(cz, pz)); | |
| _mm256_store_ps(b_vx, lvx); | |
| _mm256_store_ps(b_vy, lvy); | |
| _mm256_store_ps(b_vz, lvz); | |
| __m256 qrx, qry, qrz, qrw; | |
| quat_mul_batch8(&qrx, &qry, &qrz, &qrw, _mm256_sub_ps(zero_v, qx), _mm256_sub_ps(zero_v, qy), _mm256_sub_ps(zero_v, qz), qw, pqx, pqy, pqz, pqw); | |
| _mm256_store_ps(b_qrx, qrx); | |
| _mm256_store_ps(b_qry, qry); | |
| _mm256_store_ps(b_qrz, qrz); | |
| _mm256_store_ps(b_qrw, qrw); | |
| _mm256_store_ps(&cns->chn_prev_pos_x[c_blk], cx); | |
| _mm256_store_ps(&cns->chn_prev_pos_y[c_blk], cy); | |
| _mm256_store_ps(&cns->chn_prev_pos_z[c_blk], cz); | |
| _mm256_store_ps(&cns->chn_prev_quat_x[c_blk], qx); | |
| _mm256_store_ps(&cns->chn_prev_quat_y[c_blk], qy); | |
| _mm256_store_ps(&cns->chn_prev_quat_z[c_blk], qz); | |
| _mm256_store_ps(&cns->chn_prev_quat_w[c_blk], qw); | |
| } | |
| /* --- PHASE 2: Local Chain Simulation --- */ | |
| for (int i = 0; i < 8; ++i) { | |
| int c = c_blk + i; | |
| int p_base = c * PTC_PER_CHN, r_base = c * CON_PER_CHN; | |
| struct chn_cfg *cfg = &cns->chn_cfg[c]; | |
| __m256 damp = _mm256_set1_ps(cfg->damping); | |
| __m256 compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->stiffness), eps_v)); | |
| __m256 b_compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->bend_stiffness), eps_v)); | |
| __m256 t_cos = _mm256_set1_ps(1.0f - (cfg->rest_curvature * 0.5f)); | |
| __m256 qrx = _mm256_set1_ps(b_qrx[i]); | |
| __m256 qry = _mm256_set1_ps(b_qry[i]); | |
| __m256 qrz = _mm256_set1_ps(b_qrz[i]); | |
| __m256 qrw = _mm256_set1_ps(b_qrw[i]); | |
| __m256 o_sq = _mm256_add_ps(_mm256_mul_ps(qrx, qrx), _mm256_add_ps(_mm256_mul_ps(qry, qry), _mm256_mul_ps(qrz, qrz))); | |
| __m256 lv_x = _mm256_set1_ps(b_vx[i]); | |
| __m256 lv_y = _mm256_set1_ps(b_vy[i]); | |
| __m256 lv_z = _mm256_set1_ps(b_vz[i]); | |
| __m256 i_l = _mm256_set1_ps(cfg->linear_inertia); | |
| __m256 i_a = _mm256_set1_ps(cfg->angular_inertia); | |
| __m256 i_c = _mm256_set1_ps(cfg->centrifugal_inertia); | |
| __m256 p_pos[3][2], p_prev[3][2]; | |
| __m256 inv_m[2], rl[2], lm[2], blm[2], r_pos[3][2], m_rad[2]; | |
| UNROLL_HINT | |
| for (int k=0; k<2; ++k) { | |
| int off = p_base + k*8; | |
| int r_off = r_base + k*8; | |
| p_pos[0][k] = _mm256_load_ps(&cns->ptc_pos_x[off]); | |
| p_pos[1][k] = _mm256_load_ps(&cns->ptc_pos_y[off]); | |
| p_pos[2][k] = _mm256_load_ps(&cns->ptc_pos_z[off]); | |
| p_prev[0][k] = _mm256_load_ps(&cns->ptc_prev_pos_x[off]); | |
| p_prev[1][k] = _mm256_load_ps(&cns->ptc_prev_pos_y[off]); | |
| p_prev[2][k] = _mm256_load_ps(&cns->ptc_prev_pos_z[off]); | |
| inv_m[k] = _mm256_load_ps(&cns->ptc_inv_mass[off]); | |
| rl[k] = _mm256_load_ps(&cns->ptc_rest_len[r_off]); | |
| lm[k] = _mm256_load_ps(&cns->ptc_lambda[r_off]); | |
| blm[k] = _mm256_load_ps(&cns->ptc_bend_lambda[r_off]); | |
| r_pos[0][k] = _mm256_load_ps(&cns->rest_pos_x[off]); | |
| r_pos[1][k] = _mm256_load_ps(&cns->rest_pos_y[off]); | |
| r_pos[2][k] = _mm256_load_ps(&cns->rest_pos_z[off]); | |
| m_rad[k] = _mm256_load_ps(&cns->motion_radius[off]); | |
| } | |
| /* --- Predict + Inertia --- */ | |
| UNROLL_HINT | |
| for (int k=0; k<2; ++k) { | |
| __m256 dyn = _mm256_cmp_ps(inv_m[k], zero_v, _CMP_GT_OQ); | |
| __m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]); | |
| __m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]); | |
| __m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]); | |
| vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(lv_x, i_l), dyn)); | |
| vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(lv_y, i_l), dyn)); | |
| vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(lv_z, i_l), dyn)); | |
| __m256 rax, ray, raz; | |
| rot_vec_batch8(&rax, &ray, &raz, qrx, qry, qrz, qrw, p_pos[0][k], p_pos[1][k], p_pos[2][k]); | |
| vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[0][k], rax), i_a), dyn)); | |
| vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[1][k], ray), i_a), dyn)); | |
| vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[2][k], raz), i_a), dyn)); | |
| vx = _mm256_add_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[0][k], o_sq), i_c), dyn)); | |
| vy = _mm256_add_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[1][k], o_sq), i_c), dyn)); | |
| vz = _mm256_add_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[2][k], o_sq), i_c), dyn)); | |
| p_prev[0][k] = p_pos[0][k]; | |
| p_prev[1][k] = p_pos[1][k]; | |
| p_prev[2][k] = p_pos[2][k]; | |
| __m256 fx = _mm256_set1_ps(b_fx[i]); | |
| __m256 fy = _mm256_set1_ps(b_fy[i]); | |
| __m256 fz = _mm256_set1_ps(b_fz[i]); | |
| p_pos[0][k] = _mm256_fmadd_ps(_mm256_mul_ps(fx, inv_m[k]), dt2_v, _mm256_fmadd_ps(vx, damp, p_pos[0][k])); | |
| p_pos[1][k] = _mm256_fmadd_ps(_mm256_mul_ps(fy, inv_m[k]), dt2_v, _mm256_fmadd_ps(vy, damp, p_pos[1][k])); | |
| p_pos[2][k] = _mm256_fmadd_ps(_mm256_mul_ps(fz, inv_m[k]), dt2_v, _mm256_fmadd_ps(vz, damp, p_pos[2][k])); | |
| } | |
| /* --- XPBD Solver Iterations --- */ | |
| for (int it = 0; it < MAX_ITR; ++it) { | |
| { | |
| // Even(i) <-> Odd(i) | |
| __m256 p2x = p_pos[0][1]; | |
| __m256 p2y = p_pos[1][1]; | |
| __m256 p2z = p_pos[2][1]; | |
| __m256 p2m = inv_m[1]; | |
| solve_dist(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0], | |
| &p2x, &p2y, &p2z, p2m, rl[0], compl, &lm[0], eps_v); | |
| // Direct write-back for k=0 | |
| p_pos[0][1] = p2x; | |
| p_pos[1][1] = p2y; | |
| p_pos[2][1] = p2z; | |
| } | |
| { | |
| // Odd(i) <-> Shifted Even(i+1) | |
| __m256 p2x = _mm256_permutevar8x32_ps(p_pos[0][0], sl); | |
| __m256 p2y = _mm256_permutevar8x32_ps(p_pos[1][0], sl); | |
| __m256 p2z = _mm256_permutevar8x32_ps(p_pos[2][0], sl); | |
| __m256 p2m = _mm256_permutevar8x32_ps(inv_m[0], sl); | |
| __m256 o2x = p2x; | |
| __m256 o2y = p2y; | |
| __m256 o2z = p2z; | |
| solve_dist(&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], | |
| &p2x, &p2y, &p2z, p2m, rl[1], compl, &lm[1], eps_v); | |
| // Shifted accumulation for k=1 | |
| p_pos[0][0] = _mm256_add_ps(p_pos[0][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2x, o2x), sr)); | |
| p_pos[1][0] = _mm256_add_ps(p_pos[1][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2y, o2y), sr)); | |
| p_pos[2][0] = _mm256_add_ps(p_pos[2][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2z, o2z), sr)); | |
| } | |
| // Bending (Group 1: Odd Center) | |
| { | |
| __m256 v_b = _mm256_and_ps(_mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[1], zero_v, _CMP_GT_OQ)); | |
| __m256 rx = _mm256_permutevar8x32_ps(p_pos[0][0], sl); | |
| __m256 ry = _mm256_permutevar8x32_ps(p_pos[1][0], sl); | |
| __m256 rz = _mm256_permutevar8x32_ps(p_pos[2][0], sl); | |
| __m256 rm = _mm256_permutevar8x32_ps(inv_m[0], sl); | |
| solve_angle(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0], | |
| &p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], &rx, &ry, &rz, rm, | |
| b_compl, &blm[1], t_cos, v_b, eps_v); | |
| p_pos[0][0] = _mm256_blendv_ps(p_pos[0][0], _mm256_permutevar8x32_ps(rx, sr), v_b); | |
| p_pos[1][0] = _mm256_blendv_ps(p_pos[1][0], _mm256_permutevar8x32_ps(ry, sr), v_b); | |
| p_pos[2][0] = _mm256_blendv_ps(p_pos[2][0], _mm256_permutevar8x32_ps(rz, sr), v_b); | |
| } | |
| // Bending (Group 2: Even Center) | |
| { | |
| __m256 v_b = _mm256_and_ps(_mm256_cmp_ps(_mm256_permutevar8x32_ps(rl[1], sr), zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ)); | |
| __m256 lx = _mm256_permutevar8x32_ps(p_pos[0][1], sr); | |
| __m256 ly = _mm256_permutevar8x32_ps(p_pos[1][1], sr); | |
| __m256 lz = _mm256_permutevar8x32_ps(p_pos[2][1], sr); | |
| __m256 lmm = _mm256_permutevar8x32_ps(inv_m[1], sr); | |
| solve_angle(&lx, &ly, &lz, lmm, | |
| &p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0], | |
| &p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], | |
| b_compl, &blm[0], t_cos, v_b, eps_v); | |
| p_pos[0][1] = _mm256_blendv_ps(p_pos[0][1], _mm256_permutevar8x32_ps(lx, sl), v_b); | |
| p_pos[1][1] = _mm256_blendv_ps(p_pos[1][1], _mm256_permutevar8x32_ps(ly, sl), v_b); | |
| p_pos[2][1] = _mm256_blendv_ps(p_pos[2][1], _mm256_permutevar8x32_ps(lz, sl), v_b); | |
| } | |
| } | |
| /* --- Collisions --- */ | |
| int s_base = c * SPH_PER_CHN; | |
| const __m256 f_coef = _mm256_set1_ps(cfg->friction); | |
| for (int s = 0; s < SPH_PER_CHN; ++s) { | |
| __m256 sr = _mm256_broadcast_ss(&cns->sph_r[s_base + s]); | |
| __m256 sx = _mm256_broadcast_ss(&cns->sph_x[s_base + s]); | |
| __m256 sy = _mm256_broadcast_ss(&cns->sph_y[s_base + s]); | |
| __m256 sz = _mm256_broadcast_ss(&cns->sph_z[s_base + s]); | |
| UNROLL_HINT | |
| for (int k = 0; k < 2; ++k) { | |
| __m256 dx = _mm256_sub_ps(p_pos[0][k], sx); | |
| __m256 dy = _mm256_sub_ps(p_pos[1][k], sy); | |
| __m256 dz = _mm256_sub_ps(p_pos[2][k], sz); | |
| __m256 d2 = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz))); | |
| // 1. Calculate safe distance | |
| __m256 id = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps_v)); | |
| __m256 dist = _mm256_mul_ps(d2, id); | |
| // 2. Only calculate push if there is actual penetration and sr > 0 | |
| __m256 active_sph = _mm256_cmp_ps(sr, eps_v, _CMP_GT_OQ); | |
| __m256 colliding = _mm256_cmp_ps(dist, sr, _CMP_LT_OQ); | |
| __m256 mask = _mm256_and_ps(active_sph, colliding); | |
| __m256 pen = _mm256_sub_ps(sr, dist); | |
| __m256 push = _mm256_and_ps(pen, mask); | |
| // 3. Normal vector (zeroed if not colliding to prevent NaN propagation) | |
| __m256 nx = _mm256_mul_ps(dx, id); | |
| __m256 ny = _mm256_mul_ps(dy, id); | |
| __m256 nz = _mm256_mul_ps(dz, id); | |
| p_pos[0][k] = _mm256_add_ps(p_pos[0][k], _mm256_mul_ps(nx, push)); | |
| p_pos[1][k] = _mm256_add_ps(p_pos[1][k], _mm256_mul_ps(ny, push)); | |
| p_pos[2][k] = _mm256_add_ps(p_pos[2][k], _mm256_mul_ps(nz, push)); | |
| // 4. Friction (Stable Step-by-Step) | |
| __m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]); | |
| __m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]); | |
| __m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]); | |
| __m256 dot = _mm256_add_ps(_mm256_mul_ps(vx, nx), _mm256_add_ps(_mm256_mul_ps(vy, ny), _mm256_mul_ps(vz, nz))); | |
| // Tangent Velocity = Total Velocity - Normal Velocity | |
| __m256 tx = _mm256_sub_ps(vx, _mm256_mul_ps(dot, nx)); | |
| __m256 ty = _mm256_sub_ps(vy, _mm256_mul_ps(dot, ny)); | |
| __m256 tz = _mm256_sub_ps(vz, _mm256_mul_ps(dot, nz)); | |
| __m256 f_scale = _mm256_mul_ps(push, f_coef); | |
| // Apply friction only where colliding | |
| p_pos[0][k] = _mm256_sub_ps(p_pos[0][k], _mm256_and_ps(_mm256_mul_ps(tx, f_scale), mask)); | |
| p_pos[1][k] = _mm256_sub_ps(p_pos[1][k], _mm256_and_ps(_mm256_mul_ps(ty, f_scale), mask)); | |
| p_pos[2][k] = _mm256_sub_ps(p_pos[2][k], _mm256_and_ps(_mm256_mul_ps(tz, f_scale), mask)); | |
| } | |
| } | |
| /* --- Motion Radius Guardrail --- */ | |
| UNROLL_HINT | |
| for (int k=0; k<2; ++k) { | |
| __m256 dx = _mm256_sub_ps(p_pos[0][k], r_pos[0][k]); | |
| __m256 dy = _mm256_sub_ps(p_pos[1][k], r_pos[1][k]); | |
| __m256 dz = _mm256_sub_ps(p_pos[2][k], r_pos[2][k]); | |
| __m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz))); | |
| __m256 dist = _mm256_sqrt_ps(d2); | |
| __m256 mask = _mm256_cmp_ps(dist, m_rad[k], _CMP_GT_OQ); | |
| __m256 scale = _mm256_div_ps(m_rad[k], _mm256_max_ps(dist, eps_v)); | |
| p_pos[0][k] = _mm256_blendv_ps(p_pos[0][k], _mm256_fmadd_ps(dx, scale, r_pos[0][k]), mask); | |
| p_pos[1][k] = _mm256_blendv_ps(p_pos[1][k], _mm256_fmadd_ps(dy, scale, r_pos[1][k]), mask); | |
| p_pos[2][k] = _mm256_blendv_ps(p_pos[2][k], _mm256_fmadd_ps(dz, scale, r_pos[2][k]), mask); | |
| } | |
| /* --- Final Store --- */ | |
| UNROLL_HINT | |
| for (int k=0; k<2; ++k) { | |
| int off = p_base + k*8, r_off = r_base + k*8; | |
| _mm256_store_ps(&cns->ptc_pos_x[off], p_pos[0][k]); | |
| _mm256_store_ps(&cns->ptc_pos_y[off], p_pos[1][k]); | |
| _mm256_store_ps(&cns->ptc_pos_z[off], p_pos[2][k]); | |
| _mm256_store_ps(&cns->ptc_prev_pos_x[off], p_prev[0][k]); | |
| _mm256_store_ps(&cns->ptc_prev_pos_y[off], p_prev[1][k]); | |
| _mm256_store_ps(&cns->ptc_prev_pos_z[off], p_prev[2][k]); | |
| _mm256_store_ps(&cns->ptc_lambda[r_off], lm[k]); | |
| _mm256_store_ps(&cns->ptc_bend_lambda[r_off], blm[k]); | |
| } | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "raylib.h" | |
| #include "raymath.h" | |
| #include <stdio.h> | |
| #include <math.h> | |
| static Vector3 | |
| GetParticleWorldPos(struct chn_sim* sim, int chain_id, int particle_idx) { | |
| int p_base = chain_id * PTC_PER_CHN; | |
| int phys_idx = (particle_idx % 2 == 0) ? | |
| (p_base + particle_idx/2) : | |
| (p_base + 8 + particle_idx/2); | |
| float lx = sim->ptc_pos_x[phys_idx]; | |
| float ly = sim->ptc_pos_y[phys_idx]; | |
| float lz = sim->ptc_pos_z[phys_idx]; | |
| // Manual Vector3 x Quaternion rotation (v' = v + 2 * cross(q.xyz, cross(q.xyz, v) + q.w * v)) | |
| float qx = sim->chn_quat_x[chain_id]; | |
| float qy = sim->chn_quat_y[chain_id]; | |
| float qz = sim->chn_quat_z[chain_id]; | |
| float qw = sim->chn_quat_w[chain_id]; | |
| float tx = 2.0f * (qy * lz - qz * ly); | |
| float ty = 2.0f * (qz * lx - qx * lz); | |
| float tz = 2.0f * (qx * ly - qy * lx); | |
| float rx = lx + qw * tx + (qy * tz - qz * ty); | |
| float ry = ly + qw * ty + (qz * tx - qx * tz); | |
| float rz = lz + qw * tz + (qx * ty - qy * tx); | |
| return (Vector3){ | |
| sim->chn_pos_x[chain_id] + rx, | |
| sim->chn_pos_y[chain_id] + ry, | |
| sim->chn_pos_z[chain_id] + rz | |
| }; | |
| } | |
| extern int | |
| main(void) { | |
| const int screenWidth = 1280; | |
| const int screenHeight = 720; | |
| InitWindow(screenWidth, screenHeight, "Enshrouded Chain Physics - Diagnostic"); | |
| SetTargetFPS(60); | |
| Camera3D camera = { 0 }; | |
| camera.position = (Vector3){ 0.0f, 2.0f, 5.0f }; | |
| camera.target = (Vector3){ 0.0f, 0.5f, 0.0f }; | |
| camera.up = (Vector3){ 0.0f, 1.0f, 0.0f }; | |
| camera.fovy = 45.0f; | |
| camera.projection = CAMERA_PERSPECTIVE; | |
| struct chn_sim sim; | |
| chn_sim_init(&sim); | |
| struct chn_cfg cfg = { | |
| .damping = 0.99f, | |
| .stiffness = 0.5f, | |
| .bend_stiffness = 10.0f, | |
| .friction = 0.5f, | |
| .linear_inertia = 0.1f, | |
| .rest_curvature = 0.0f | |
| }; | |
| // Create chain | |
| float local_rest_pos[16 * 4]; | |
| for (int i = 0; i < 16; ++i) { | |
| local_rest_pos[i*4 + 0] = 0.0f; | |
| local_rest_pos[i*4 + 1] = -(0.15f * i); // Offset downwards from root | |
| local_rest_pos[i*4 + 2] = 0.0f; | |
| local_rest_pos[i*4 + 3] = (i==0) ? 0.0f : 1.0f; // Root fixed at 0,0,0 local | |
| } | |
| chn_sim_add(&sim, &cfg, local_rest_pos, 16); | |
| float grav[] = { 0.0f, -9.81f, 0.0f }; | |
| chn_sim_grav(&sim, 0, grav); | |
| // Set initial world location | |
| float p_init[] = { 0.0f, 2.0f, 0.0f }; // Start at height 2.0 | |
| float r_init[] = { 0, 0, 0, 1 }; | |
| chn_sim_tel(&sim, 0, p_init, r_init); | |
| Vector3 rootPos = {0.0f, 2.0f, 0.0f}; | |
| Vector3 spherePos = { 0.0f, 0.8f, 0.2f }; | |
| float sphereRadius = 0.7f; | |
| while (!WindowShouldClose()) { | |
| if (IsKeyPressed(KEY_Q)) { | |
| break; | |
| } | |
| float dt = 0.016f; // Fixed DT for stable debugging | |
| if (IsKeyDown(KEY_UP)) { | |
| rootPos.z -= 4.0f * dt; | |
| } | |
| if (IsKeyDown(KEY_DOWN)) { | |
| rootPos.z += 4.0f * dt; | |
| } | |
| if (IsKeyDown(KEY_LEFT)) { | |
| rootPos.x -= 4.0f * dt; | |
| } | |
| if (IsKeyDown(KEY_RIGHT)) { | |
| rootPos.x += 4.0f * dt; | |
| } | |
| // Move Sphere | |
| if (IsKeyDown(KEY_W)) { | |
| spherePos.z -= 2.0f * dt; | |
| } | |
| if (IsKeyDown(KEY_S)) { | |
| spherePos.z += 2.0f * dt; | |
| } | |
| if (IsKeyDown(KEY_A)) { | |
| spherePos.x -= 2.0f * dt; | |
| } | |
| if (IsKeyDown(KEY_D)){ | |
| spherePos.x += 2.0f * dt; | |
| } | |
| UpdateCamera(&camera, CAMERA_ORBITAL); | |
| // 1. Move the World Space Root and pass movement to the simulator | |
| rootPos.x += sinf(GetTime() * 2.0f) * 0.005f; | |
| float p[] = { rootPos.x, rootPos.y, rootPos.z }; | |
| float r[] = { 0, 0, 0, 1 }; // Identity rotation for now | |
| chn_sim_mov(&sim, 0, p, r); | |
| // 3. Update Colliders (MUST be in Local Space relative to rootPos) | |
| float local_sph[] = { | |
| spherePos.x - rootPos.x, | |
| spherePos.y - rootPos.y, | |
| spherePos.z - rootPos.z, | |
| sphereRadius | |
| }; | |
| chn_sim_set_sph(&sim, 0, local_sph, 1); | |
| chn_sim_run(&sim, dt); | |
| BeginDrawing(); | |
| ClearBackground(DARKGRAY); | |
| BeginMode3D(camera); | |
| DrawGrid(10, 1.0f); | |
| DrawSphere(spherePos, sphereRadius, Fade(RED, 0.3f)); | |
| for(int i=0; i<15; i++) { | |
| Vector3 p1 = GetParticleWorldPos(&sim, 0, i); | |
| Vector3 p2 = GetParticleWorldPos(&sim, 0, i+1); | |
| DrawCapsule(p1, p2, 0.02f, 4, 4, GOLD); | |
| } | |
| EndMode3D(); | |
| DrawFPS(10, 10); | |
| EndDrawing(); | |
| } | |
| CloseWindow(); | |
| return 0; | |
| } |
Author
vurtun
commented
Dec 19, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment