/**
 * @brief Check whether a pointer's address is a multiple of @p alignment.
 *
 * Works for any non-zero alignment, not only powers of two. For power-of-two
 * alignments the result is identical to the classic
 * `address & (alignment - 1)` mask test.
 *
 * @param ptr        Pointer to test; it is never dereferenced.
 * @param alignment  Required alignment in bytes.
 * @return true if the address of @p ptr is a multiple of @p alignment;
 *         false otherwise, including when @p alignment is 0.
 */
inline bool is_aligned(const void* ptr, size_t alignment) noexcept {
    // A zero alignment is meaningless and would make the modulo undefined.
    if (alignment == 0) {
        return false;
    }

    const auto address = reinterpret_cast<uintptr_t>(ptr);

    // Modulo rather than a bit mask: the mask silently produces wrong answers
    // when the alignment is not a power of two.
    return address % alignment == 0;
}
47inline float sum_floats(
const float* data,
size_t count)
noexcept {
51 for (
size_t i = 0; i < count; ++i) {
57 __m256 sum_vec = _mm256_setzero_ps();
58 size_t simd_count = count & ~7;
61 for (
size_t i = 0; i < simd_count; i += 8) {
62 __m256 vec = _mm256_loadu_ps(&data[i]);
63 sum_vec = _mm256_add_ps(sum_vec, vec);
67 __m128 low = _mm256_castps256_ps128(sum_vec);
68 __m128 high = _mm256_extractf128_ps(sum_vec, 1);
69 __m128 sum128 = _mm_add_ps(low, high);
70 sum128 = _mm_hadd_ps(sum128, sum128);
71 sum128 = _mm_hadd_ps(sum128, sum128);
73 float sum = _mm_cvtss_f32(sum128);
76 for (
size_t i = simd_count; i < count; ++i) {
94inline float dot_product(
const float* a,
const float* b,
size_t count)
noexcept {
95 __m256 sum_vec = _mm256_setzero_ps();
96 size_t simd_count = count & ~7;
98 for (
size_t i = 0; i < simd_count; i += 8) {
99 __m256 vec_a = _mm256_loadu_ps(&a[i]);
100 __m256 vec_b = _mm256_loadu_ps(&b[i]);
101 __m256 prod = _mm256_mul_ps(vec_a, vec_b);
102 sum_vec = _mm256_add_ps(sum_vec, prod);
106 __m128 low = _mm256_castps256_ps128(sum_vec);
107 __m128 high = _mm256_extractf128_ps(sum_vec, 1);
108 __m128 sum128 = _mm_add_ps(low, high);
109 sum128 = _mm_hadd_ps(sum128, sum128);
110 sum128 = _mm_hadd_ps(sum128, sum128);
112 float sum = _mm_cvtss_f32(sum128);
115 for (
size_t i = simd_count; i < count; ++i) {
135inline void scale_floats(
float* data,
size_t count,
float scale)
noexcept {
136 __m256 scale_vec = _mm256_set1_ps(scale);
137 size_t simd_count = count & ~7;
139 for (
size_t i = 0; i < simd_count; i += 8) {
140 __m256 vec = _mm256_loadu_ps(&data[i]);
141 vec = _mm256_mul_ps(vec, scale_vec);
142 _mm256_storeu_ps(&data[i], vec);
146 for (
size_t i = simd_count; i < count; ++i) {
162inline void fma_floats(
const float* a,
const float* b,
const float* c,
163 float* result,
size_t count)
noexcept {
164 size_t simd_count = count & ~7;
166 for (
size_t i = 0; i < simd_count; i += 8) {
167 __m256 vec_a = _mm256_loadu_ps(&a[i]);
168 __m256 vec_b = _mm256_loadu_ps(&b[i]);
169 __m256 vec_c = _mm256_loadu_ps(&c[i]);
170 __m256 res = _mm256_fmadd_ps(vec_a, vec_b, vec_c);
171 _mm256_storeu_ps(&result[i], res);
175 for (
size_t i = simd_count; i < count; ++i) {
176 result[i] = a[i] * b[i] + c[i];
187inline float min_float(
const float* data,
size_t count)
noexcept {
188 if (count == 0)
return 0.0f;
190 __m256 min_vec = _mm256_set1_ps(std::numeric_limits<float>::max());
191 size_t simd_count = count & ~7;
193 for (
size_t i = 0; i < simd_count; i += 8) {
194 __m256 vec = _mm256_loadu_ps(&data[i]);
195 min_vec = _mm256_min_ps(min_vec, vec);
200 _mm256_storeu_ps(mins, min_vec);
201 float min_val = *std::min_element(mins, mins + 8);
204 for (
size_t i = simd_count; i < count; ++i) {
205 min_val = std::min(min_val, data[i]);
218inline float max_float(
const float* data,
size_t count)
noexcept {
219 if (count == 0)
return 0.0f;
221 __m256 max_vec = _mm256_set1_ps(std::numeric_limits<float>::lowest());
222 size_t simd_count = count & ~7;
224 for (
size_t i = 0; i < simd_count; i += 8) {
225 __m256 vec = _mm256_loadu_ps(&data[i]);
226 max_vec = _mm256_max_ps(max_vec, vec);
231 _mm256_storeu_ps(maxs, max_vec);
232 float max_val = *std::max_element(maxs, maxs + 8);
235 for (
size_t i = simd_count; i < count; ++i) {
236 max_val = std::max(max_val, data[i]);
250template<
int distance = 1>
252 _mm_prefetch(
static_cast<const char*
>(ptr), distance);
263 __builtin_prefetch(ptr, 1, 1);
276 size_t simd_count = count & ~7;
280 for (
size_t i = 0; i < simd_count; i += 8) {
281 __m256 vec = _mm256_loadu_ps(&src[i]);
282 _mm256_stream_ps(&dest[i], vec);
287 for (
size_t i = 0; i < simd_count; i += 8) {
288 __m256 vec = _mm256_loadu_ps(&src[i]);
289 _mm256_storeu_ps(&dest[i], vec);
294 for (
size_t i = simd_count; i < count; ++i) {
312template<
typename TableType = shm_table>
320 float sum() const noexcept {
324 float min() const noexcept {
328 float max() const noexcept {
337 size_t min_size = std::min(arr.
size(), other.size());
Fixed-size array in shared memory with zero-overhead access.
size_t size() const noexcept
Get number of elements.
pointer data() noexcept
Get pointer to underlying data.
Helper class for SIMD operations on shm_array.
void scale(float factor) noexcept
float max() const noexcept
float min() const noexcept
float sum() const noexcept
SimdArray(shm_array< float, TableType > &array)
float dot(const shm_array< float, TableType > &other) const noexcept
float max_float(const float *data, size_t count) noexcept
Find the maximum value in a float array (returns 0.0f when the array is empty).
void stream_store_floats(float *dest, const float *src, size_t count) noexcept
Stream store (bypass cache) for large arrays.
float min_float(const float *data, size_t count) noexcept
Find the minimum value in a float array (returns 0.0f when the array is empty).
void prefetch_read(const void *ptr) noexcept
Prefetch data for read.
float sum_floats(const float *data, size_t count) noexcept
Vectorized sum of float array using AVX2.
bool is_aligned(const void *ptr, size_t alignment) noexcept
Check if pointer is aligned to boundary.
float dot_product(const float *a, const float *b, size_t count) noexcept
Vectorized dot product of two float arrays.
void prefetch_write(void *ptr) noexcept
Prefetch data for write.
void fma_floats(const float *a, const float *b, const float *c, float *result, size_t count) noexcept
Vectorized FMA operation: result = a * b + c.
void scale_floats(float *data, size_t count, float scale) noexcept
Vectorized in-place array scaling (multiplies every element by a scalar).
Fixed-size shared memory array with STL compatibility.