POSIX Shared Memory Data Structures 1.0
High-performance lock-free data structures for inter-process communication
shm_simd_utils.h
#pragma once
#include <immintrin.h>
#include <cstddef>
#include <cstdint>    // uintptr_t
#include <limits>     // std::numeric_limits
#include <span>
#include <algorithm>
#include "shm_array.h"

namespace shm_simd {

// Check if pointer is aligned to boundary (alignment must be a power of two).
inline bool is_aligned(const void* ptr, size_t alignment) noexcept {
    return (reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) == 0;
}

// Vectorized sum of float array using AVX2.
inline float sum_floats(const float* data, size_t count) noexcept {
    if (count < 8) {
        // Scalar fallback for small arrays
        float sum = 0.0f;
        for (size_t i = 0; i < count; ++i) {
            sum += data[i];
        }
        return sum;
    }

    __m256 sum_vec = _mm256_setzero_ps();
    size_t simd_count = count & ~7; // Round down to multiple of 8

    // Main SIMD loop
    for (size_t i = 0; i < simd_count; i += 8) {
        __m256 vec = _mm256_loadu_ps(&data[i]);
        sum_vec = _mm256_add_ps(sum_vec, vec);
    }

    // Horizontal sum of vector
    __m128 low = _mm256_castps256_ps128(sum_vec);
    __m128 high = _mm256_extractf128_ps(sum_vec, 1);
    __m128 sum128 = _mm_add_ps(low, high);
    sum128 = _mm_hadd_ps(sum128, sum128);
    sum128 = _mm_hadd_ps(sum128, sum128);

    float sum = _mm_cvtss_f32(sum128);

    // Handle remaining elements
    for (size_t i = simd_count; i < count; ++i) {
        sum += data[i];
    }

    return sum;
}

// Vectorized dot product of two float arrays.
inline float dot_product(const float* a, const float* b, size_t count) noexcept {
    __m256 sum_vec = _mm256_setzero_ps();
    size_t simd_count = count & ~7;

    for (size_t i = 0; i < simd_count; i += 8) {
        __m256 vec_a = _mm256_loadu_ps(&a[i]);
        __m256 vec_b = _mm256_loadu_ps(&b[i]);
        __m256 prod = _mm256_mul_ps(vec_a, vec_b);
        sum_vec = _mm256_add_ps(sum_vec, prod);
    }

    // Horizontal sum
    __m128 low = _mm256_castps256_ps128(sum_vec);
    __m128 high = _mm256_extractf128_ps(sum_vec, 1);
    __m128 sum128 = _mm_add_ps(low, high);
    sum128 = _mm_hadd_ps(sum128, sum128);
    sum128 = _mm_hadd_ps(sum128, sum128);

    float sum = _mm_cvtss_f32(sum128);

    // Scalar remainder
    for (size_t i = simd_count; i < count; ++i) {
        sum += a[i] * b[i];
    }

    return sum;
}

// Vectorized array scaling (multiply by scalar).
inline void scale_floats(float* data, size_t count, float scale) noexcept {
    __m256 scale_vec = _mm256_set1_ps(scale);
    size_t simd_count = count & ~7;

    for (size_t i = 0; i < simd_count; i += 8) {
        __m256 vec = _mm256_loadu_ps(&data[i]);
        vec = _mm256_mul_ps(vec, scale_vec);
        _mm256_storeu_ps(&data[i], vec);
    }

    // Scalar remainder
    for (size_t i = simd_count; i < count; ++i) {
        data[i] *= scale;
    }
}

// Vectorized FMA operation: result = a * b + c.
inline void fma_floats(const float* a, const float* b, const float* c,
                       float* result, size_t count) noexcept {
    size_t simd_count = count & ~7;

    for (size_t i = 0; i < simd_count; i += 8) {
        __m256 vec_a = _mm256_loadu_ps(&a[i]);
        __m256 vec_b = _mm256_loadu_ps(&b[i]);
        __m256 vec_c = _mm256_loadu_ps(&c[i]);
        __m256 res = _mm256_fmadd_ps(vec_a, vec_b, vec_c);
        _mm256_storeu_ps(&result[i], res);
    }

    // Scalar remainder
    for (size_t i = simd_count; i < count; ++i) {
        result[i] = a[i] * b[i] + c[i];
    }
}

// Find minimum value in float array (returns 0.0f for an empty array).
inline float min_float(const float* data, size_t count) noexcept {
    if (count == 0) return 0.0f;

    __m256 min_vec = _mm256_set1_ps(std::numeric_limits<float>::max());
    size_t simd_count = count & ~7;

    for (size_t i = 0; i < simd_count; i += 8) {
        __m256 vec = _mm256_loadu_ps(&data[i]);
        min_vec = _mm256_min_ps(min_vec, vec);
    }

    // Extract minimum from vector
    float mins[8];
    _mm256_storeu_ps(mins, min_vec);
    float min_val = *std::min_element(mins, mins + 8);

    // Check remainder
    for (size_t i = simd_count; i < count; ++i) {
        min_val = std::min(min_val, data[i]);
    }

    return min_val;
}

// Find maximum value in float array (returns 0.0f for an empty array).
inline float max_float(const float* data, size_t count) noexcept {
    if (count == 0) return 0.0f;

    __m256 max_vec = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    size_t simd_count = count & ~7;

    for (size_t i = 0; i < simd_count; i += 8) {
        __m256 vec = _mm256_loadu_ps(&data[i]);
        max_vec = _mm256_max_ps(max_vec, vec);
    }

    // Extract maximum from vector
    float maxs[8];
    _mm256_storeu_ps(maxs, max_vec);
    float max_val = *std::max_element(maxs, maxs + 8);

    // Check remainder
    for (size_t i = simd_count; i < count; ++i) {
        max_val = std::max(max_val, data[i]);
    }

    return max_val;
}

// Prefetch data for read: rw = 0 requests a read prefetch; locality ranges from
// 0 (no temporal reuse) to 3 (keep in all cache levels).
template<int locality = 1>
inline void prefetch_read(const void* ptr) noexcept {
    __builtin_prefetch(ptr, 0, locality);
}

// Prefetch data for write (rw = 1) with moderate temporal locality.
inline void prefetch_write(void* ptr) noexcept {
    __builtin_prefetch(ptr, 1, 1);
}

// Stream store (bypass cache) for large arrays; falls back to regular stores when unaligned.
inline void stream_store_floats(float* dest, const float* src, size_t count) noexcept {
    size_t simd_count = count & ~7;

    // Non-temporal stores require a 32-byte aligned destination
    if (is_aligned(dest, 32)) {
        for (size_t i = 0; i < simd_count; i += 8) {
            __m256 vec = _mm256_loadu_ps(&src[i]);
            _mm256_stream_ps(&dest[i], vec);
        }
        _mm_sfence(); // Ensure non-temporal stores are globally visible
    } else {
        // Fallback to regular stores
        for (size_t i = 0; i < simd_count; i += 8) {
            __m256 vec = _mm256_loadu_ps(&src[i]);
            _mm256_storeu_ps(&dest[i], vec);
        }
    }

    // Handle remainder
    for (size_t i = simd_count; i < count; ++i) {
        dest[i] = src[i];
    }
}

// Helper class for SIMD operations on shm_array.
template<typename TableType = shm_table>
class SimdArray {
private:
    shm_array<float, TableType>& arr;

public:
    explicit SimdArray(shm_array<float, TableType>& array) : arr(array) {}

    float sum() const noexcept {
        return sum_floats(arr.data(), arr.size());
    }

    float min() const noexcept {
        return min_float(arr.data(), arr.size());
    }

    float max() const noexcept {
        return max_float(arr.data(), arr.size());
    }

    void scale(float factor) noexcept {
        scale_floats(arr.data(), arr.size(), factor);
    }

    float dot(const shm_array<float, TableType>& other) const noexcept {
        size_t min_size = std::min(arr.size(), other.size());
        return dot_product(arr.data(), other.data(), min_size);
    }
};

} // namespace shm_simd
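
The free functions above operate on plain float pointers, so they can be tried out without any shared-memory setup. The following sketch is an illustration only (the file name, build flags, and buffer sizes are assumptions, not part of the library); it presumes this header and its shm_array.h dependency are on the include path and that the code is compiled with AVX2 and FMA enabled, e.g. g++ -O2 -mavx2 -mfma.

// simd_utils_example.cpp -- hypothetical usage sketch, not part of the library.
#include "shm_simd_utils.h"

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> a(1000, 1.5f);
    std::vector<float> b(1000, 2.0f);

    // Vectorized reduction; the scalar tail handles the last (count % 8) elements.
    float total = shm_simd::sum_floats(a.data(), a.size());           // 1500.0

    // Dot product over the common length of the two buffers.
    float dot = shm_simd::dot_product(a.data(), b.data(), a.size());  // 3000.0

    // In-place scaling: every element of a becomes 3.0.
    shm_simd::scale_floats(a.data(), a.size(), 2.0f);

    std::printf("sum=%.1f dot=%.1f a[0]=%.1f\n", total, dot, a[0]);
    return 0;
}

Because every vector load in the header uses _mm256_loadu_ps, ordinary heap buffers such as std::vector storage work without any special alignment.
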
Referenced symbols:

shm_array - Fixed-size array in shared memory with zero-overhead access. Definition: shm_array.h:63
size_t shm_array::size() const noexcept - Get number of elements. Definition: shm_array.h:221
pointer shm_array::data() noexcept - Get pointer to underlying data. Definition: shm_array.h:333
shm_simd::SimdArray - Helper class for SIMD operations on shm_array.
    SimdArray(shm_array<float, TableType>& array)
    float sum() const noexcept
    float min() const noexcept
    float max() const noexcept
    void scale(float factor) noexcept
    float dot(const shm_array<float, TableType>& other) const noexcept
bool is_aligned(const void* ptr, size_t alignment) noexcept - Check if pointer is aligned to boundary.
float sum_floats(const float* data, size_t count) noexcept - Vectorized sum of float array using AVX2.
float dot_product(const float* a, const float* b, size_t count) noexcept - Vectorized dot product of two float arrays.
void scale_floats(float* data, size_t count, float scale) noexcept - Vectorized array scaling (multiply by scalar).
void fma_floats(const float* a, const float* b, const float* c, float* result, size_t count) noexcept - Vectorized FMA operation: result = a * b + c.
float min_float(const float* data, size_t count) noexcept - Find minimum value in float array.
float max_float(const float* data, size_t count) noexcept - Find maximum value in float array.
void prefetch_read(const void* ptr) noexcept - Prefetch data for read.
void prefetch_write(void* ptr) noexcept - Prefetch data for write.
void stream_store_floats(float* dest, const float* src, size_t count) noexcept - Stream store (bypass cache) for large arrays.
shm_array.h - Fixed-size shared memory array with STL compatibility.
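
A note on stream_store_floats: it only takes the non-temporal _mm256_stream_ps path when the destination passes is_aligned(dest, 32); otherwise it silently falls back to regular stores, so the call is always safe. The sketch below is a hypothetical caller, not library code; the buffer size and the use of std::aligned_alloc (C++17) are choices made here for illustration, and std::aligned_alloc requires the allocation size to be a multiple of the alignment.

// stream_store_example.cpp -- hypothetical sketch of the aligned streaming path.
#include "shm_simd_utils.h"

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <vector>

int main() {
    constexpr std::size_t n = 1u << 20;          // ~1M floats; non-temporal stores pay off on large copies
    std::vector<float> src(n, 42.0f);

    // 32-byte alignment so the helper takes the _mm256_stream_ps branch.
    auto* dest = static_cast<float*>(std::aligned_alloc(32, n * sizeof(float)));
    if (dest == nullptr) return 1;
    assert(shm_simd::is_aligned(dest, 32));

    shm_simd::stream_store_floats(dest, src.data(), n);  // bypasses the cache, then issues _mm_sfence()

    std::free(dest);
    return 0;
}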