// Broadcast the per-step constants into all eight float lanes of a 256-bit register.
93 __m256 dt_vec = _mm256_set1_ps(dt);
94 __m256 zero_vec = _mm256_setzero_ps();
// Fraction of the (negated) vertical velocity kept when a particle hits z = 0.
95 __m256 bounce_factor = _mm256_set1_ps(0.8f);
// Advance eight particles per iteration over the index range [0, simd_count).
// NOTE(review): every load/store touches elements [i, i + 8), so this is only
// in-bounds if simd_count is a multiple of 8 (or the arrays are padded); the
// definition of simd_count is not visible here — confirm.
99 for (
size_t i = 0; i < simd_count; i += 8) {
// Unaligned loads (loadu) of velocity, acceleration and position components.
101 __m256 vx = _mm256_loadu_ps(&particles.
vx[i]);
102 __m256 vy = _mm256_loadu_ps(&particles.
vy[i]);
103 __m256 vz = _mm256_loadu_ps(&particles.
vz[i]);
105 __m256 ax = _mm256_loadu_ps(&particles.
ax[i]);
106 __m256 ay = _mm256_loadu_ps(&particles.
ay[i]);
107 __m256 az = _mm256_loadu_ps(&particles.
az[i]);
109 __m256 px = _mm256_loadu_ps(&particles.
x[i]);
110 __m256 py = _mm256_loadu_ps(&particles.
y[i]);
111 __m256 pz = _mm256_loadu_ps(&particles.
z[i]);
// v = a * dt + v, as a single fused multiply-add per component.
114 vx = _mm256_fmadd_ps(ax, dt_vec, vx);
115 vy = _mm256_fmadd_ps(ay, dt_vec, vy);
116 vz = _mm256_fmadd_ps(az, dt_vec, vz);
// p = v * dt + p, using the velocity that was just updated above.
119 px = _mm256_fmadd_ps(vx, dt_vec, px);
120 py = _mm256_fmadd_ps(vy, dt_vec, py);
121 pz = _mm256_fmadd_ps(vz, dt_vec, pz);
// Per-lane mask: all bits set where pz < 0. _CMP_LT_OQ is an ordered compare,
// so lanes holding NaN produce a zero mask. The mask must be taken before pz
// is clamped on the next line.
124 __m256 below_ground = _mm256_cmp_ps(pz, zero_vec, _CMP_LT_OQ);
// Clamp z to be >= 0.
// NOTE(review): max_ps returns its second operand when either input is NaN,
// so a NaN pz becomes 0 here, whereas a scalar `z < 0.0f` test would leave it
// NaN — confirm whether that difference matters.
125 pz = _mm256_max_ps(pz, zero_vec);
// Candidate post-bounce velocity: -vz * 0.8 in every lane.
128 __m256 neg_vz = _mm256_mul_ps(vz, _mm256_set1_ps(-1.0f));
129 __m256 damped_vz = _mm256_mul_ps(neg_vz, bounce_factor);
// Take damped_vz only in lanes flagged by below_ground; keep vz elsewhere.
130 vz = _mm256_blendv_ps(vz, damped_vz, below_ground);
// Write the updated velocities and positions back (unaligned stores).
133 _mm256_storeu_ps(&particles.
vx[i], vx);
134 _mm256_storeu_ps(&particles.
vy[i], vy);
135 _mm256_storeu_ps(&particles.
vz[i], vz);
137 _mm256_storeu_ps(&particles.
x[i], px);
138 _mm256_storeu_ps(&particles.
y[i], py);
139 _mm256_storeu_ps(&particles.
z[i], pz);
// Scalar update of particle i: same sequence as the 8-wide loop, one element
// at a time. The enclosing loop header is not visible in this excerpt —
// presumably it covers the indices left over after the SIMD loop; confirm.
// Velocity first: v += a * dt.
144 particles.
vx[i] += particles.
ax[i] * dt;
145 particles.
vy[i] += particles.
ay[i] * dt;
146 particles.
vz[i] += particles.
az[i] * dt;
// Then position, using the velocity that was just updated: p += v * dt.
148 particles.
x[i] += particles.
vx[i] * dt;
149 particles.
y[i] += particles.
vy[i] * dt;
150 particles.
z[i] += particles.
vz[i] * dt;
// Ground bounce: if z went negative, clamp it to 0 and replace vz with
// -vz * 0.8 (direction reversed, 20% of the speed removed).
152 if (particles.
z[i] < 0.0f) {
153 particles.
z[i] = 0.0f;
154 particles.
vz[i] = -particles.
vz[i] * 0.8f;
// Compute the squared speed vx^2 + vy^2 + vz^2 for eight particles per
// iteration and store it in v_squared[i .. i + 7].
// NOTE(review): as with any 8-wide access, this assumes simd_count is a
// multiple of 8 and v_squared has at least simd_count elements — neither is
// visible here; confirm.
181 for (
size_t i = 0; i < simd_count; i += 8) {
182 __m256 vx = _mm256_loadu_ps(&particles.
vx[i]);
183 __m256 vy = _mm256_loadu_ps(&particles.
vy[i]);
184 __m256 vz = _mm256_loadu_ps(&particles.
vz[i]);
// Square each component lane-wise.
186 __m256 vx2 = _mm256_mul_ps(vx, vx);
187 __m256 vy2 = _mm256_mul_ps(vy, vy);
188 __m256 vz2 = _mm256_mul_ps(vz, vz);
// Sum as vx2 + (vy2 + vz2); no square root is taken.
190 __m256 v2 = _mm256_add_ps(vx2, _mm256_add_ps(vy2, vz2));
191 _mm256_storeu_ps(&v_squared[i], v2);
// Scalar squared speed for particle i: vx*vx + vy*vy + vz*vz. The enclosing
// loop header is not visible in this excerpt.
// NOTE(review): this sums left to right, (vx^2 + vy^2) + vz^2, while the
// 8-wide loop sums vx^2 + (vy^2 + vz^2); float results can differ in the last
// bit between the two paths.
196 v_squared[i] = particles.
vx[i] * particles.
vx[i] +
197 particles.
vy[i] * particles.
vy[i] +
198 particles.
vz[i] * particles.
vz[i];
// Benchmark/report driver (fragment): the opening `try`, the timed calls and
// the tails of a few statements are not visible in this excerpt.
// Create the posix_shm object named "simd_simulation", sized by shm_size.
// (posix_shm is declared elsewhere — presumably a shared-memory wrapper.)
212 posix_shm shm(
"simd_simulation", shm_size);
214 std::cout <<
"=== SIMD Particle Simulation ===" << std::endl;
216 std::cout <<
"Iterations: " <<
ITERATIONS << std::endl;
217 std::cout << std::endl;
// Time step handed to the update routines.
223 const float dt = 0.01f;
// First timed section; the work being measured sits between the two now()
// calls and is not visible here.
226 auto start = high_resolution_clock::now();
231 auto end = high_resolution_clock::now();
// Elapsed time truncated to whole microseconds.
232 auto scalar_time = duration_cast<microseconds>(end - start).count();
// Second timed section, reusing start/end.
238 start = high_resolution_clock::now();
243 end = high_resolution_clock::now();
244 auto simd_time = duration_cast<microseconds>(end - start).count();
247 std::cout <<
"Performance Results:" << std::endl;
248 std::cout <<
"-------------------" << std::endl;
// From here on, floating-point values print in fixed notation with two
// decimals; the integer microsecond counts are unaffected.
249 std::cout << std::fixed << std::setprecision(2);
250 std::cout <<
"Scalar version: " << scalar_time <<
" µs" << std::endl;
251 std::cout <<
"SIMD version: " << simd_time <<
" µs" << std::endl;
// NOTE(review): simd_time is an integer microsecond count; if it is 0 this
// float division prints inf (or nan when scalar_time is also 0).
252 std::cout <<
"Speedup: " << (float)scalar_time / simd_time <<
"x" << std::endl;
253 std::cout << std::endl;
// ke_scalar and ke_simd are computed outside this excerpt; the absolute
// difference shows how far the two code paths drifted apart.
255 std::cout <<
"Final kinetic energy:" << std::endl;
256 std::cout <<
"Scalar: " << ke_scalar <<
" J" << std::endl;
257 std::cout <<
"SIMD: " << ke_simd <<
" J" << std::endl;
258 std::cout <<
"Difference: " << std::abs(ke_scalar - ke_simd) <<
" J" << std::endl;
259 std::cout << std::endl;
262 std::cout <<
"SIMD Helper Functions:" << std::endl;
263 std::cout <<
"---------------------" << std::endl;
// vx_simd is declared outside this excerpt; only its min()/max()/sum()
// results are printed here.
266 std::cout <<
"Min velocity X: " << vx_simd.
min() <<
" m/s" << std::endl;
267 std::cout <<
"Max velocity X: " << vx_simd.
max() <<
" m/s" << std::endl;
268 std::cout <<
"Sum velocity X: " << vx_simd.
sum() <<
" m/s" << std::endl;
271 std::cout << std::endl;
// Alignment report; the expressions that produce the printed values are on
// lines not visible in this excerpt.
272 std::cout <<
"Memory Alignment:" << std::endl;
273 std::cout <<
"----------------" << std::endl;
274 std::cout <<
"Position X aligned to 32: "
276 std::cout <<
"Velocity X aligned to 32: "
281 }
catch (
const std::exception& e) {
282 std::cerr <<
"Error: " << e.what() << std::endl;
ParticleSystemSoA(posix_shm &shm, size_t count)