Vital
Loading...
Searching...
No Matches
poly_values.h
Go to the documentation of this file.
1#pragma once
2
3#include <cstdint>
4#include <climits>
5#include <cstdlib>
6
// Selects exactly one SIMD backend; the first matching branch wins.
//  - VITAL_AVX2 set by the build: rejected at compile time by the static_assert.
//  - __SSE2__ defined by the compiler: VITAL_SSE2 is defined to 1.
//  - __ARM_NEON__ or __ARM_NEON defined: VITAL_NEON is defined to 1.
//  - anything else: compilation fails with the second static_assert.
19#if VITAL_AVX2
20 #define VITAL_AVX2 1
21 static_assert(false, "AVX2 is not supported yet.");
22#elif __SSE2__
23 #define VITAL_SSE2 1
24#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
25 #define VITAL_NEON 1
26#else
27 static_assert(false, "No SIMD Intrinsics found which are necessary for compilation");
28#endif
29
30#if VITAL_SSE2
31 #include <immintrin.h>
32#elif VITAL_NEON
33 #include <arm_neon.h>
34#endif
35
// Compiler-specific annotations used on every function in this header.
// On MSVC they map to __forceinline / __vectorcall; elsewhere force_inline is
// `inline` plus the always_inline attribute and vector_call expands to nothing.
// NOTE(review): both macros are skipped when force_inline is already defined,
// so vector_call is assumed to be defined alongside it in that case.
36#if !defined(force_inline)
37#if defined (_MSC_VER)
38#define force_inline __forceinline
39 #define vector_call __vectorcall
40#else
41 #define force_inline inline __attribute__((always_inline))
42 #define vector_call
43#endif
44#endif
45
46namespace vital {
47
56 struct poly_int {
57#if VITAL_AVX2
58 static constexpr size_t kSize = 8;
59 typedef __m256i simd_type;
60#elif VITAL_SSE2
61 static constexpr size_t kSize = 4;
62 typedef __m128i simd_type;
63#elif VITAL_NEON
64 static constexpr size_t kSize = 4;
65 typedef uint32x4_t simd_type;
66#endif
67
73 int32_t scalar[kSize];
74 simd_type simd;
75 };
76
82 simd_type simd;
83 int32_t scalar[kSize];
84 };
85
/// All 32 bits set.
86 static constexpr uint32_t kFullMask = (unsigned int)-1;
/// Only the most significant bit (bit 31) set.
87 static constexpr uint32_t kSignMask = 0x80000000;
/// Every bit except bit 31 set (kFullMask with kSignMask cleared).
88 static constexpr uint32_t kNotSignMask = kFullMask ^ kSignMask;
89
/// Builds a SIMD register with the same 32-bit value in every lane.
/// @param scalar Value to broadcast.
/// @return Register whose lanes all hold scalar.
95 static force_inline simd_type vector_call init(uint32_t scalar) {
96#if VITAL_AVX2
97 return _mm256_set1_epi32((int32_t)scalar);
98#elif VITAL_SSE2
99 return _mm_set1_epi32((int32_t)scalar);
100#elif VITAL_NEON
101 return vdupq_n_u32(scalar);
102#endif
103 }
104
/// Loads one register's worth of consecutive 32-bit values from memory.
/// The x86 paths use the unaligned-load intrinsics.
/// @param memory Pointer to the first value to read.
/// @return Register holding the loaded values.
110 static force_inline simd_type vector_call load(const uint32_t* memory) {
111#if VITAL_AVX2
112 return _mm256_loadu_si256((const __m256i*)memory);
113#elif VITAL_SSE2
114 return _mm_loadu_si128((const __m128i*)memory);
115#elif VITAL_NEON
116 return vld1q_u32(memory);
117#endif
118 }
119
/// Adds two integer SIMD registers lane by lane.
/// @param one First operand.
/// @param two Second operand.
/// @return one + two per 32-bit lane.
126 static force_inline simd_type vector_call add(simd_type one, simd_type two) {
127#if VITAL_AVX2
128 return _mm256_add_epi32(one, two);
129#elif VITAL_SSE2
130 return _mm_add_epi32(one, two);
131#elif VITAL_NEON
132 return vaddq_u32(one, two);
133#endif
134 }
135
/// Subtracts one integer SIMD register from another lane by lane.
/// @param one Minuend.
/// @param two Subtrahend.
/// @return one - two per 32-bit lane.
142 static force_inline simd_type vector_call sub(simd_type one, simd_type two) {
143#if VITAL_AVX2
144 return _mm256_sub_epi32(one, two);
145#elif VITAL_SSE2
146 return _mm_sub_epi32(one, two);
147#elif VITAL_NEON
148 return vsubq_u32(one, two);
149#endif
150 }
151
/// Negates every lane. The x86 paths compute 0 - value; the NEON path
/// multiplies each unsigned lane by -1.
/// @param value Register to negate.
/// @return The negated register.
157 static force_inline simd_type vector_call neg(simd_type value) {
158#if VITAL_AVX2
159 return _mm256_sub_epi32(_mm256_set1_epi32(0), value);
160#elif VITAL_SSE2
161 return _mm_sub_epi32(_mm_set1_epi32(0), value);
162#elif VITAL_NEON
163 return vmulq_n_u32(value, -1);
164#endif
165 }
166
/// Multiplies two integer SIMD registers lane by lane, keeping the low
/// 32 bits of each product.
/// @param one First operand.
/// @param two Second operand.
/// @return one * two per 32-bit lane.
173 static force_inline simd_type vector_call mul(simd_type one, simd_type two) {
174#if VITAL_AVX2
// _mm256_mul_epi32 would only multiply the even lanes into 64-bit products;
// the lane-wise 32-bit multiply is _mm256_mullo_epi32.
175 return _mm256_mullo_epi32(one, two);
176#elif VITAL_SSE2
177 // SSE2 does not have a direct epi32 multiply, so we emulate it:
// 64-bit products of lanes 0 and 2.
178 simd_type mul0_2 = _mm_mul_epu32(one, two);
// Swap adjacent lanes so lanes 1 and 3 land in the even positions.
179 simd_type mul1_3 = _mm_mul_epu32(
180 _mm_shuffle_epi32(one, _MM_SHUFFLE(2, 3, 0, 1)),
181 _mm_shuffle_epi32(two, _MM_SHUFFLE(2, 3, 0, 1)));
// Gather the low 32 bits of each product and interleave back into lane order.
182 return _mm_unpacklo_epi32(
183 _mm_shuffle_epi32(mul0_2, _MM_SHUFFLE (0, 0, 2, 0)),
184 _mm_shuffle_epi32(mul1_3, _MM_SHUFFLE (0, 0, 2, 0)));
185#elif VITAL_NEON
186 return vmulq_u32(one, two);
187#endif
188 }
189
/// Bitwise AND of a register with a mask.
/// @param value Register to mask.
/// @param mask Mask to apply.
/// @return value & mask.
196 static force_inline simd_type vector_call bitAnd(simd_type value, simd_type mask) {
197#if VITAL_AVX2
198 return _mm256_and_si256(value, mask);
199#elif VITAL_SSE2
200 return _mm_and_si128(value, mask);
201#elif VITAL_NEON
202 return vandq_u32(value, mask);
203#endif
204 }
205
/// Bitwise OR of a register with a mask.
/// @param value Register to combine.
/// @param mask Mask to apply.
/// @return value | mask.
212 static force_inline simd_type vector_call bitOr(simd_type value, simd_type mask) {
213#if VITAL_AVX2
214 return _mm256_or_si256(value, mask);
215#elif VITAL_SSE2
216 return _mm_or_si128(value, mask);
217#elif VITAL_NEON
218 return vorrq_u32(value, mask);
219#endif
220 }
221
/// Bitwise XOR of a register with a mask.
/// @param value Register to combine.
/// @param mask Mask to apply.
/// @return value ^ mask.
228 static force_inline simd_type vector_call bitXor(simd_type value, simd_type mask) {
229#if VITAL_AVX2
230 return _mm256_xor_si256(value, mask);
231#elif VITAL_SSE2
232 return _mm_xor_si128(value, mask);
233#elif VITAL_NEON
234 return veorq_u32(value, mask);
235#endif
236 }
237
/// Inverts every bit by XORing with an all-ones register
/// (-1 converts to 0xFFFFFFFF in init's uint32_t parameter).
/// @param value Register to invert.
/// @return ~value.
243 static force_inline simd_type vector_call bitNot(simd_type value) {
244 return bitXor(value, init(-1));
245 }
246
/// Lane-wise maximum, treating each lane as an unsigned 32-bit value on
/// every backend.
/// @param one First operand.
/// @param two Second operand.
/// @return The larger of one and two per lane.
253 static force_inline simd_type vector_call max(simd_type one, simd_type two) {
254#if VITAL_AVX2
// Unsigned variant, to agree with the SSE2 (greaterThan) and NEON (u32) paths.
255 return _mm256_max_epu32(one, two);
256#elif VITAL_SSE2
// greaterThan performs an unsigned comparison and yields an all-ones lane mask.
257 simd_type greater_than_mask = greaterThan(one, two);
258 // Choose 'one' where mask is set, else 'two'
259 return _mm_or_si128(_mm_and_si128(greater_than_mask, one),
260 _mm_andnot_si128(greater_than_mask, two));
261#elif VITAL_NEON
262 return vmaxq_u32(one, two);
263#endif
264 }
265
/// Lane-wise minimum, treating each lane as an unsigned 32-bit value on
/// every backend (consistent with max and greaterThan).
/// @param one First operand.
/// @param two Second operand.
/// @return The smaller of one and two per lane.
272 static force_inline simd_type vector_call min(simd_type one, simd_type two) {
273#if VITAL_AVX2
// Unsigned variant, to agree with the NEON (u32) path.
274 return _mm256_min_epu32(one, two);
275#elif VITAL_SSE2
// Use the unsigned greaterThan helper; a raw _mm_cmpgt_epi32 compares signed
// and mis-orders lanes with bit 31 set.
276 simd_type less_than_mask = greaterThan(two, one);
277 // Choose 'one' where mask is set, else 'two'
278 return _mm_or_si128(_mm_and_si128(less_than_mask, one),
279 _mm_andnot_si128(less_than_mask, two));
280#elif VITAL_NEON
281 return vminq_u32(one, two);
282#endif
283 }
284
/// Lane-wise equality comparison.
/// @param one First operand.
/// @param two Second operand.
/// @return Mask register: all bits set in lanes where one == two, zero elsewhere.
291 static force_inline simd_type vector_call equal(simd_type one, simd_type two) {
292#if VITAL_AVX2
293 return _mm256_cmpeq_epi32(one, two);
294#elif VITAL_SSE2
295 return _mm_cmpeq_epi32(one, two);
296#elif VITAL_NEON
297 return vceqq_u32(one, two);
298#endif
299 }
300
/// Lane-wise unsigned greater-than comparison.
/// The x86 compare intrinsic is signed, so both operands are XORed with
/// kSignMask first; flipping bit 31 makes the signed compare order the
/// lanes as unsigned values. NEON has a direct unsigned compare.
/// @param one Left operand.
/// @param two Right operand.
/// @return Mask register: all bits set in lanes where one > two, zero elsewhere.
309 static force_inline simd_type vector_call greaterThan(simd_type one, simd_type two) {
310#if VITAL_AVX2
311 return _mm256_cmpgt_epi32(_mm256_xor_si256(one, init(kSignMask)),
312 _mm256_xor_si256(two, init(kSignMask)));
313#elif VITAL_SSE2
314 return _mm_cmpgt_epi32(_mm_xor_si128(one, init(kSignMask)),
315 _mm_xor_si128(two, init(kSignMask)));
316#elif VITAL_NEON
317 return vcgtq_u32(one, two);
318#endif
319 }
320
/// Adds all lanes of a register together (wrapping 32-bit arithmetic).
/// @param value Register whose lanes are summed.
/// @return Sum of the lanes as a uint32_t.
326 static force_inline uint32_t vector_call sum(simd_type value) {
327#if VITAL_AVX2
328 // Example logic (not fully implemented):
329 // simd_type flip = _mm256_permute4x64_epi64(value, _MM_SHUFFLE(1, 0, 3, 2));
330 // ...
331 // return ...
332 // Implementation incomplete in this code snippet.
333 #error "AVX2 version not fully implemented in code snippet"
334#elif VITAL_SSE2
// Read the lanes back through the union's scalar view and accumulate them.
335 simd_scalar_union union_value { value };
336 uint32_t total = 0;
// size_t index matches kSize's type and avoids a signed/unsigned comparison.
337 for (size_t i = 0; i < kSize; ++i)
338 total += union_value.scalar[i];
339 return total;
340#elif VITAL_NEON
// Two pairwise adds reduce four lanes to one.
341 uint32x2_t partial_sum = vpadd_u32(vget_low_u32(value), vget_high_u32(value));
342 partial_sum = vpadd_u32(partial_sum, partial_sum);
343 return vget_lane_u32(partial_sum, 0);
344#endif
345 }
346
/// Collapses a mask register to a single integer for "is anything set" tests.
/// The x86 paths return the byte-wise movemask (one bit per byte, taken from
/// each byte's top bit); the NEON path returns the largest lane value.
/// NOTE(review): the numeric result therefore differs between backends;
/// callers presumably only test it against zero - confirm.
/// @param value Mask register to test.
/// @return Backend-specific summary value as described above.
352 static force_inline uint32_t vector_call anyMask(simd_type value) {
353#if VITAL_AVX2
354 return _mm256_movemask_epi8(value);
355#elif VITAL_SSE2
356 return _mm_movemask_epi8(value);
357#elif VITAL_NEON
358 // For NEON, we typically reduce it down:
359 uint32x2_t max_vals = vpmax_u32(vget_low_u32(value), vget_high_u32(value));
360 max_vals = vpmax_u32(max_vals, max_vals);
361 return vget_lane_u32(max_vals, 0);
362#endif
363 }
364
367 return max(one.value, two.value);
368 }
370 return min(one.value, two.value);
371 }
373 return equal(one.value, two.value);
374 }
376 return greaterThan(one.value, two.value);
377 }
379 return greaterThan(two.value, one.value);
380 }
382 return sum(value.value);
383 }
384
385 simd_type value;
386
/// Default constructor: every lane is set to 0.
390 force_inline poly_int() noexcept { value = init(0); }
391
/// Wraps an existing raw SIMD register.
/// @param initial_value Register copied into value.
396 force_inline poly_int(simd_type initial_value) noexcept : value(initial_value) { }
397
/// Broadcasts a single 32-bit value to every lane.
/// @param initial_value Value stored in each lane.
402 force_inline poly_int(uint32_t initial_value) noexcept {
403 value = init(initial_value);
404 }
405
/// Builds a register from four explicit lane values, in lane order.
/// The values are written through the union's scalar array and the register
/// is read back from its simd member.
/// @param first  Lane 0.
/// @param second Lane 1.
/// @param third  Lane 2.
/// @param fourth Lane 3.
413 force_inline poly_int(uint32_t first, uint32_t second, uint32_t third, uint32_t fourth) noexcept {
414 scalar_simd_union union_value { (int32_t)first, (int32_t)second, (int32_t)third, (int32_t)fourth };
415 value = union_value.simd;
416 }
417
/// Builds a register from two values repeated as (first, second, first, second)
/// by delegating to the four-value constructor.
/// @param first  Value for lanes 0 and 2.
/// @param second Value for lanes 1 and 3.
423 force_inline poly_int(uint32_t first, uint32_t second) noexcept
424 : poly_int(first, second, first, second) { }
425
/// Destructor; has nothing to release.
429 force_inline ~poly_int() noexcept { }
430
/// Reads a single lane. No bounds check is performed on index.
/// @param index Lane to read.
/// @return The lane's value.
436 force_inline uint32_t vector_call access(size_t index) const noexcept {
437#if VITAL_AVX2
// Same union as the SSE2 path; there is no type named simd_union.
438 simd_scalar_union union_value { value };
439 return union_value.scalar[index];
440#elif VITAL_SSE2
// Copy the register into the union and read the lane through its scalar view.
441 simd_scalar_union union_value { value };
442 return union_value.scalar[index];
443#elif VITAL_NEON
// Direct subscripting of the NEON vector type (compiler vector extension).
444 return value[index];
445#endif
446 }
447
/// Overwrites a single lane, leaving the others unchanged.
/// No bounds check is performed on index.
/// @param index Lane to write.
/// @param new_value Value stored in that lane.
453 force_inline void vector_call set(size_t index, uint32_t new_value) noexcept {
454#if VITAL_AVX2
// Same union and cast as the SSE2 path; there is no type named simd_union.
455 simd_scalar_union union_value { value };
456 union_value.scalar[index] = (int32_t)new_value;
457 value = union_value.simd;
458#elif VITAL_SSE2
// Round-trip through the union: copy out, patch one lane, copy back.
459 simd_scalar_union union_value { value };
460 union_value.scalar[index] = (int32_t)new_value;
461 value = union_value.simd;
462#elif VITAL_NEON
// Direct subscripting of the NEON vector type (compiler vector extension).
463 value[index] = new_value;
464#endif
465 }
466
/// Read-only lane access; forwards to access(index).
/// @param index Lane to read.
/// @return The lane's value.
472 force_inline uint32_t vector_call operator[](size_t index) const noexcept {
473 return access(index);
474 }
475
478 value = add(value, other.value);
479 return *this;
480 }
482 value = sub(value, other.value);
483 return *this;
484 }
486 value = mul(value, other.value);
487 return *this;
488 }
490 value = bitAnd(value, other.value);
491 return *this;
492 }
494 value = bitOr(value, other.value);
495 return *this;
496 }
498 value = bitXor(value, other.value);
499 return *this;
500 }
501
/// Adds a raw SIMD register to this value lane by lane; returns *this.
503 force_inline poly_int& vector_call operator+=(simd_type other) noexcept {
504 value = add(value, other);
505 return *this;
506 }
/// Subtracts a raw SIMD register from this value lane by lane; returns *this.
507 force_inline poly_int& vector_call operator-=(simd_type other) noexcept {
508 value = sub(value, other);
509 return *this;
510 }
/// Multiplies this value by a raw SIMD register lane by lane; returns *this.
511 force_inline poly_int& vector_call operator*=(simd_type other) noexcept {
512 value = mul(value, other);
513 return *this;
514 }
/// Bitwise-ANDs this value with a raw SIMD register; returns *this.
515 force_inline poly_int& vector_call operator&=(simd_type other) noexcept {
516 value = bitAnd(value, other);
517 return *this;
518 }
/// Bitwise-ORs this value with a raw SIMD register; returns *this.
519 force_inline poly_int& vector_call operator|=(simd_type other) noexcept {
520 value = bitOr(value, other);
521 return *this;
522 }
/// Bitwise-XORs this value with a raw SIMD register; returns *this.
523 force_inline poly_int& vector_call operator^=(simd_type other) noexcept {
524 value = bitXor(value, other);
525 return *this;
526 }
527
/// Adds the same scalar to every lane (the scalar is broadcast with init); returns *this.
529 force_inline poly_int& vector_call operator+=(uint32_t scalar) noexcept {
530 value = add(value, init(scalar));
531 return *this;
532 }
/// Subtracts the same scalar from every lane (the scalar is broadcast with init); returns *this.
533 force_inline poly_int& vector_call operator-=(uint32_t scalar) noexcept {
534 value = sub(value, init(scalar));
535 return *this;
536 }
/// Multiplies every lane by the same scalar (the scalar is broadcast with init); returns *this.
537 force_inline poly_int& vector_call operator*=(uint32_t scalar) noexcept {
538 value = mul(value, init(scalar));
539 return *this;
540 }
541
544 return add(value, other.value);
545 }
547 return sub(value, other.value);
548 }
550 return mul(value, other.value);
551 }
552
555 return bitAnd(value, other.value);
556 }
558 return bitOr(value, other.value);
559 }
561 return bitXor(value, other.value);
562 }
563
566 return neg(value);
567 }
569 return bitNot(value);
570 }
571
/// Sums all lanes of this value; forwards to the static sum(simd_type).
/// @return Sum of the lanes.
576 force_inline uint32_t vector_call sum() const noexcept {
577 return sum(value);
578 }
579
/// Reduces this value to a single integer; forwards to the static anyMask(simd_type).
/// @return The backend-specific summary value produced by the static overload.
584 force_inline uint32_t vector_call anyMask() const noexcept {
585 return anyMask(value);
586 }
587 };
588
591
600 struct poly_float {
601#if VITAL_AVX2
602 static constexpr size_t kSize = 8;
603 typedef __m256 simd_type;
604 typedef __m256i mask_simd_type;
605#elif VITAL_SSE2
606 static constexpr size_t kSize = 4;
607 typedef __m128 simd_type;
608 typedef __m128i mask_simd_type;
609#elif VITAL_NEON
610 static constexpr size_t kSize = 4;
611 typedef float32x4_t simd_type;
612 typedef uint32x4_t mask_simd_type;
613#endif
614
620 simd_type simd;
621 float scalar[kSize];
622 };
623
629 float scalar[kSize];
630 simd_type simd;
631 };
632
/// Interprets the bits of a float SIMD register as an integer mask register.
/// Uses cast/reinterpret intrinsics, so the bit pattern is carried over unchanged.
/// @param value Float register to reinterpret.
/// @return The same bits typed as mask_simd_type.
638 static force_inline mask_simd_type vector_call toMask(simd_type value) {
639#if VITAL_AVX2
640 return _mm256_castps_si256(value);
641#elif VITAL_SSE2
642 return _mm_castps_si128(value);
643#elif VITAL_NEON
644 return vreinterpretq_u32_f32(value);
645#endif
646 }
647
/// Interprets the bits of an integer mask register as a float SIMD register.
/// Inverse of toMask; the bit pattern is carried over unchanged.
/// @param mask Mask register to reinterpret.
/// @return The same bits typed as simd_type.
653 static force_inline simd_type vector_call toSimd(mask_simd_type mask) {
654#if VITAL_AVX2
655 return _mm256_castsi256_ps(mask);
656#elif VITAL_SSE2
657 return _mm_castsi128_ps(mask);
658#elif VITAL_NEON
659 return vreinterpretq_f32_u32(mask);
660#endif
661 }
662
/// Initializes a SIMD register with the same float repeated in every lane.
/// @param scalar Value to broadcast.
/// @return Register whose lanes all hold scalar.
668 static force_inline simd_type vector_call init(float scalar) {
669#if VITAL_AVX2
670 return _mm256_broadcast_ss(&scalar);
671#elif VITAL_SSE2
672 return _mm_set1_ps(scalar);
673#elif VITAL_NEON
674 return vdupq_n_f32(scalar);
675#endif
676 }
677
/// Loads one register's worth of consecutive floats from memory.
/// The x86 paths use the unaligned-load intrinsics.
/// @param memory Pointer to the first float to read.
/// @return Register holding the loaded values.
683 static force_inline simd_type vector_call load(const float* memory) {
684#if VITAL_AVX2
685 return _mm256_loadu_ps(memory);
686#elif VITAL_SSE2
687 return _mm_loadu_ps(memory);
688#elif VITAL_NEON
689 return vld1q_f32(memory);
690#endif
691 }
692
/// Adds two SIMD float registers lane by lane.
/// @param one First operand.
/// @param two Second operand.
/// @return one + two per lane.
699 static force_inline simd_type vector_call add(simd_type one, simd_type two) {
700#if VITAL_AVX2
701 return _mm256_add_ps(one, two);
702#elif VITAL_SSE2
703 return _mm_add_ps(one, two);
704#elif VITAL_NEON
705 return vaddq_f32(one, two);
706#endif
707 }
708
/// Subtracts one SIMD float register from another lane by lane.
/// @param one Minuend.
/// @param two Subtrahend.
/// @return one - two per lane.
715 static force_inline simd_type vector_call sub(simd_type one, simd_type two) {
716#if VITAL_AVX2
717 return _mm256_sub_ps(one, two);
718#elif VITAL_SSE2
719 return _mm_sub_ps(one, two);
720#elif VITAL_NEON
721 return vsubq_f32(one, two);
722#endif
723 }
724
/// Negates every lane. The x86 paths XOR with -0.f, which flips only the
/// sign bit; the NEON path multiplies by -1.0f.
/// @param value Register to negate.
/// @return The negated register.
730 static force_inline simd_type vector_call neg(simd_type value) {
731#if VITAL_AVX2
732 return _mm256_xor_ps(value, _mm256_set1_ps(-0.f));
733#elif VITAL_SSE2
734 return _mm_xor_ps(value, _mm_set1_ps(-0.f));
735#elif VITAL_NEON
736 return vmulq_n_f32(value, -1.0f);
737#endif
738 }
739
/// Multiplies two SIMD float registers element-wise.
/// @param one First operand.
/// @param two Second operand.
/// @return one * two per lane.
746 static force_inline simd_type vector_call mul(simd_type one, simd_type two) {
747#if VITAL_AVX2
748 return _mm256_mul_ps(one, two);
749#elif VITAL_SSE2
750 return _mm_mul_ps(one, two);
751#elif VITAL_NEON
752 return vmulq_f32(one, two);
753#endif
754 }
755
/// Multiplies a SIMD float register by a float scalar.
/// @param value Register to scale.
/// @param scalar Factor applied to every lane.
/// @return value * scalar per lane.
762 static force_inline simd_type vector_call mulScalar(simd_type value, float scalar) {
763#if VITAL_AVX2
// The broadcast must be 256 bits wide to match _mm256_mul_ps's operands.
764 return _mm256_mul_ps(value, _mm256_set1_ps(scalar));
765#elif VITAL_SSE2
766 return _mm_mul_ps(value, _mm_set1_ps(scalar));
767#elif VITAL_NEON
768 return vmulq_n_f32(value, scalar);
769#endif
770 }
771
/// Multiply-add: returns one + (two * three) per lane.
/// The AVX2 path uses a single fused instruction; the SSE2 path performs a
/// separate multiply and add. On NEON, vmlaq_f32 is used unless
/// NEON_VFP_V3 is defined, in which case multiply and add are issued separately.
/// @param one Addend.
/// @param two First factor.
/// @param three Second factor.
/// @return one + two * three.
779 static force_inline simd_type vector_call mulAdd(simd_type one, simd_type two, simd_type three) {
780#if VITAL_AVX2
781 return _mm256_fmadd_ps(two, three, one);
782#elif VITAL_SSE2
783 return _mm_add_ps(one, _mm_mul_ps(two, three));
784#elif VITAL_NEON
785#if defined(NEON_VFP_V3)
786 return vaddq_f32(one, vmulq_f32(two, three));
787#else
788 return vmlaq_f32(one, two, three);
789#endif
790#endif
791 }
792
/// Multiply-subtract: returns one - (two * three) per lane.
/// The SSE2 path performs a separate multiply and subtract. On NEON,
/// vmlsq_f32 is used unless NEON_VFP_V3 is defined.
/// @param one Value subtracted from.
/// @param two First factor.
/// @param three Second factor.
/// @return one - two * three.
800 static force_inline simd_type vector_call mulSub(simd_type one, simd_type two, simd_type three) {
801#if VITAL_AVX2
802 // one - two * three is computed as -(two * three) + one with a single fused
803 // negate-multiply-add, mirroring the _mm256_fmadd_ps used by mulAdd.
804 // This needs FMA support, which mulAdd's AVX2 path already relies on.
805 return _mm256_fnmadd_ps(two, three, one);
806#elif VITAL_SSE2
807 return _mm_sub_ps(one, _mm_mul_ps(two, three));
808#elif VITAL_NEON
809#if defined(NEON_VFP_V3)
810 return vsubq_f32(one, vmulq_f32(two, three));
811#else
812 return vmlsq_f32(one, two, three);
813#endif
814#endif
815 }
816
/// Divides one SIMD float register by another, element-wise.
/// When NEON_ARM32 is defined the quotient is computed as one * (1 / two),
/// where the reciprocal starts from vrecpeq_f32's estimate and is refined by
/// two vrecpsq_f32 steps; the other paths use a division intrinsic.
/// @param one Dividend.
/// @param two Divisor.
/// @return one / two per lane.
823 static force_inline simd_type vector_call div(simd_type one, simd_type two) {
824#if VITAL_AVX2
825 return _mm256_div_ps(one, two);
826#elif VITAL_SSE2
827 return _mm_div_ps(one, two);
828#elif VITAL_NEON
829#if defined(NEON_ARM32)
830 // Approximate reciprocal then refine
831 simd_type reciprocal = vrecpeq_f32(two);
832 reciprocal = vmulq_f32(vrecpsq_f32(two, reciprocal), reciprocal);
833 reciprocal = vmulq_f32(vrecpsq_f32(two, reciprocal), reciprocal);
834 return vmulq_f32(one, reciprocal);
835#else
836 return vdivq_f32(one, two);
837#endif
838#endif
839 }
840
/// Bitwise AND of a float SIMD register with an integer mask.
/// On x86 the mask is reinterpreted as float; on NEON the float register is
/// reinterpreted as integer, combined, and reinterpreted back.
/// @param value Float register to mask.
/// @param mask Integer mask to apply.
/// @return The masked bits as a float register.
847 static force_inline simd_type vector_call bitAnd(simd_type value, mask_simd_type mask) {
848#if VITAL_AVX2
849 return _mm256_and_ps(value, toSimd(mask));
850#elif VITAL_SSE2
851 return _mm_and_ps(value, toSimd(mask));
852#elif VITAL_NEON
853 return toSimd(vandq_u32(toMask(value), mask));
854#endif
855 }
856
/// Bitwise OR of a float SIMD register with an integer mask.
/// @param value Float register to combine.
/// @param mask Integer mask to apply.
/// @return The combined bits as a float register.
863 static force_inline simd_type vector_call bitOr(simd_type value, mask_simd_type mask) {
864#if VITAL_AVX2
865 return _mm256_or_ps(value, toSimd(mask));
866#elif VITAL_SSE2
867 return _mm_or_ps(value, toSimd(mask));
868#elif VITAL_NEON
869 return toSimd(vorrq_u32(toMask(value), mask));
870#endif
871 }
872
/// Bitwise XOR of a float SIMD register with an integer mask.
/// @param value Float register to combine.
/// @param mask Integer mask to apply.
/// @return The combined bits as a float register.
879 static force_inline simd_type vector_call bitXor(simd_type value, mask_simd_type mask) {
880#if VITAL_AVX2
881 return _mm256_xor_ps(value, toSimd(mask));
882#elif VITAL_SSE2
883 return _mm_xor_ps(value, toSimd(mask));
884#elif VITAL_NEON
885 return toSimd(veorq_u32(toMask(value), mask));
886#endif
887 }
888
/// Inverts every bit of a float register by XORing it with an all-ones mask
/// built by poly_mask::init(-1).
/// @param value Register to invert.
/// @return The bitwise complement as a float register.
894 static force_inline simd_type vector_call bitNot(simd_type value) {
895 return bitXor(value, poly_mask::init(-1));
896 }
897
/// Returns the element-wise maximum of two SIMD float registers.
/// @param one First operand.
/// @param two Second operand.
/// @return The larger of one and two per lane.
904 static force_inline simd_type vector_call max(simd_type one, simd_type two) {
905#if VITAL_AVX2
906 return _mm256_max_ps(one, two);
907#elif VITAL_SSE2
908 return _mm_max_ps(one, two);
909#elif VITAL_NEON
910 return vmaxq_f32(one, two);
911#endif
912 }
913
/// Returns the element-wise minimum of two SIMD float registers.
/// @param one First operand.
/// @param two Second operand.
/// @return The smaller of one and two per lane.
920 static force_inline simd_type vector_call min(simd_type one, simd_type two) {
921#if VITAL_AVX2
922 return _mm256_min_ps(one, two);
923#elif VITAL_SSE2
924 return _mm_min_ps(one, two);
925#elif VITAL_NEON
926 return vminq_f32(one, two);
927#endif
928 }
929
935 static force_inline simd_type vector_call abs(simd_type value) {
937 }
938
944 static force_inline mask_simd_type vector_call sign_mask(simd_type value) {
946 }
947
/// Compares two SIMD float registers for equality, element-wise.
/// @param one First operand.
/// @param two Second operand.
/// @return Integer mask: all bits set in lanes where one == two, zero elsewhere.
954 static force_inline mask_simd_type vector_call equal(simd_type one, simd_type two) {
955#if VITAL_AVX2
956 // The 256-bit compare takes its predicate as a third argument: equality is _mm256_cmp_ps with _CMP_EQ_OQ.
957 return toMask(_mm256_cmp_ps(one, two, _CMP_EQ_OQ));
958#elif VITAL_SSE2
959 return toMask(_mm_cmpeq_ps(one, two));
960#elif VITAL_NEON
961 return vceqq_f32(one, two);
962#endif
963 }
964
/// Compares two SIMD float registers, element-wise, for greater than.
/// @param one Left operand.
/// @param two Right operand.
/// @return Integer mask: all bits set in lanes where one > two, zero elsewhere.
971 static force_inline mask_simd_type vector_call greaterThan(simd_type one, simd_type two) {
972#if VITAL_AVX2
973 return toMask(_mm256_cmp_ps(one, two, _CMP_GT_OQ));
974#elif VITAL_SSE2
975 return toMask(_mm_cmpgt_ps(one, two));
976#elif VITAL_NEON
977 return vcgtq_f32(one, two);
978#endif
979 }
980
/// Compares two SIMD float registers, element-wise, for greater than or equal.
/// @param one Left operand.
/// @param two Right operand.
/// @return Integer mask: all bits set in lanes where one >= two, zero elsewhere.
987 static force_inline mask_simd_type vector_call greaterThanOrEqual(simd_type one, simd_type two) {
988#if VITAL_AVX2
989 return toMask(_mm256_cmp_ps(one, two, _CMP_GE_OQ));
990#elif VITAL_SSE2
991 return toMask(_mm_cmpge_ps(one, two));
992#elif VITAL_NEON
993 return vcgeq_f32(one, two);
994#endif
995 }
996
/// Compares two SIMD float registers for non-equality, element-wise.
/// Every backend yields the exact complement of equal(): a lane is set when
/// the values differ or when either of them is NaN.
/// @param one First operand.
/// @param two Second operand.
/// @return Integer mask: all bits set in lanes where one != two, zero elsewhere.
1003 static force_inline mask_simd_type vector_call notEqual(simd_type one, simd_type two) {
1004#if VITAL_AVX2
// Unordered predicate, so NaN lanes report "not equal" like _mm_cmpneq_ps does.
1005 return toMask(_mm256_cmp_ps(one, two, _CMP_NEQ_UQ));
1006#elif VITAL_SSE2
1007 return toMask(_mm_cmpneq_ps(one, two));
1008#elif VITAL_NEON
1009 // Invert the equality mask instead of OR-ing (one > two) with (one < two):
1010 // both of those are false for NaN, which made NaN lanes look equal.
1011 return vmvnq_u32(vceqq_f32(one, two));
1012#endif
1013 }
1014
/// Computes the sum of all elements in a SIMD float register.
/// @param value Register whose lanes are summed.
/// @return Sum of the lanes.
1020 static force_inline float vector_call sum(simd_type value) {
1021#if VITAL_AVX2
1022 // Example logic (not fully implemented):
1023 // simd_type flip = _mm256_permute2f128_ps(value, value, 1);
1024 // ...
1025 // return ...
1026 // Implementation incomplete in the snippet.
1027 #error "AVX2 version not fully implemented in code snippet"
1028#elif VITAL_SSE2
// Swap the two halves and add: lane 0 now holds v0 + v2, lane 1 holds v1 + v3.
1029 simd_type flip = _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 0, 3, 2));
1030 simd_type sum_vec = _mm_add_ps(value, flip);
// Swap adjacent lanes and add again so lane 0 holds the full total.
1031 simd_type swap = _mm_shuffle_ps(sum_vec, sum_vec, _MM_SHUFFLE(2, 3, 0, 1));
1032 return _mm_cvtss_f32(_mm_add_ps(sum_vec, swap));
1033#elif VITAL_NEON
// Two pairwise adds reduce four lanes to one.
1034 float32x2_t partial_sum = vpadd_f32(vget_low_f32(value), vget_high_f32(value));
1035 partial_sum = vpadd_f32(partial_sum, partial_sum);
1036 return vget_lane_f32(partial_sum, 0);
1037#endif
1038 }
1039
/// Performs an in-place 4x4 transpose of four registers containing float data.
/// After the call, rowN holds what was lane N of each of the four input rows,
/// in row order.
/// @param row0 In: first row.  Out: lane 0 of every input row.
/// @param row1 In: second row. Out: lane 1 of every input row.
/// @param row2 In: third row.  Out: lane 2 of every input row.
/// @param row3 In: fourth row. Out: lane 3 of every input row.
1048 static force_inline void vector_call transpose(simd_type& row0, simd_type& row1,
1049 simd_type& row2, simd_type& row3) {
1050#if VITAL_AVX2
1051 static_assert(false, "AVX2 transpose not supported yet");
1052#elif VITAL_SSE2
// Interleave row pairs: low* carry lanes 0-1, high* carry lanes 2-3.
1053 __m128 low0 = _mm_unpacklo_ps(row0, row1);
1054 __m128 low1 = _mm_unpacklo_ps(row2, row3);
1055 __m128 high0 = _mm_unpackhi_ps(row0, row1);
1056 __m128 high1 = _mm_unpackhi_ps(row2, row3);
// Stitch matching halves of the interleaved pairs into the output rows.
1057 row0 = _mm_movelh_ps(low0, low1);
1058 row1 = _mm_movehl_ps(low1, low0);
1059 row2 = _mm_movelh_ps(high0, high1);
1060 row3 = _mm_movehl_ps(high1, high0);
1061#elif VITAL_NEON
// vtrnq_f32 transposes each 2x2 sub-block of a row pair.
1062 float32x4x2_t swap_low = vtrnq_f32(row0, row1);
1063 float32x4x2_t swap_high = vtrnq_f32(row2, row3);
1064 // This snippet attempts to emulate a 4x4 transpose with vtrnq and vextq
// The inner vextq rotates a register by two lanes; the outer one then joins
// the wanted half of the top pair with the wanted half of the bottom pair.
1065 row0 = vextq_f32(vextq_f32(swap_low.val[0], swap_low.val[0], 2), swap_high.val[0], 2);
1066 row1 = vextq_f32(vextq_f32(swap_low.val[1], swap_low.val[1], 2), swap_high.val[1], 2);
1067 row2 = vextq_f32(swap_low.val[0], vextq_f32(swap_high.val[0], swap_high.val[0], 2), 2);
1068 row3 = vextq_f32(swap_low.val[1], vextq_f32(swap_high.val[1], swap_high.val[1], 2), 2);
1069#else
1070 // No-op or error for other platforms
1071#endif
1072 }
1073
1076 return mulAdd(one.value, two.value, three.value);
1077 }
1079 return mulSub(one.value, two.value, three.value);
1080 }
1082 return max(one.value, two.value);
1083 }
1085 return min(one.value, two.value);
1086 }
1088 return abs(value.value);
1089 }
1094 return equal(one.value, two.value);
1095 }
1097 return notEqual(one.value, two.value);
1098 }
1106 return greaterThan(two.value, one.value);
1107 }
1111
1112 simd_type value;
1113
/// Default constructor. Initializes every lane to zero (0.0f).
1117 force_inline poly_float() noexcept { value = init(0.0f); }
1118
/// Constructs from a raw SIMD register.
/// @param initial_value Register copied into value.
1123 force_inline poly_float(simd_type initial_value) noexcept : value(initial_value) { }
1124
/// Constructs the SIMD register by broadcasting a single float value.
/// @param initial_value Value stored in every lane.
1129 force_inline poly_float(float initial_value) noexcept {
1130 value = init(initial_value);
1131 }
1132
/// Constructs a register from two values repeated as
/// (initial_value1, initial_value2, initial_value1, initial_value2).
/// @param initial_value1 Value for lanes 0 and 2.
/// @param initial_value2 Value for lanes 1 and 3.
1138 force_inline poly_float(float initial_value1, float initial_value2) noexcept {
1139 scalar_simd_union union_value { initial_value1, initial_value2, initial_value1, initial_value2 };
1140 value = union_value.simd;
1141 }
1142
/// Constructs a register from four explicit lane values, in lane order.
/// The values are written through the union's scalar array and the register
/// is read back from its simd member.
/// @param first  Lane 0.
/// @param second Lane 1.
/// @param third  Lane 2.
/// @param fourth Lane 3.
1150 force_inline poly_float(float first, float second, float third, float fourth) noexcept {
1151 scalar_simd_union union_value { first, second, third, fourth };
1152 value = union_value.simd;
1153 }
1154
1159
/// Accessor for an element in the SIMD register.
/// No bounds check is performed on index.
/// @param index Lane to read.
/// @return The lane's value.
1165 force_inline float vector_call access(size_t index) const noexcept {
1166#if VITAL_AVX2
// Same union as the SSE2 path; there is no type named simd_union.
1167 simd_scalar_union union_value { value };
1168 return union_value.scalar[index];
1169#elif VITAL_SSE2
// Copy the register into the union and read the lane through its scalar view.
1170 simd_scalar_union union_value { value };
1171 return union_value.scalar[index];
1172#elif VITAL_NEON
// Direct subscripting of the NEON vector type (compiler vector extension).
1173 return value[index];
1174#endif
1175 }
1176
/// Overwrites a single lane, leaving the others unchanged.
/// No bounds check is performed on index.
/// @param index Lane to write.
/// @param new_value Value stored in that lane.
1182 force_inline void vector_call set(size_t index, float new_value) noexcept {
1183#if VITAL_AVX2
// Same union as the SSE2 path; there is no type named simd_union.
1184 simd_scalar_union union_value { value };
1185 union_value.scalar[index] = new_value;
1186 value = union_value.simd;
1187#elif VITAL_SSE2
// Round-trip through the union: copy out, patch one lane, copy back.
1188 simd_scalar_union union_value { value };
1189 union_value.scalar[index] = new_value;
1190 value = union_value.simd;
1191#elif VITAL_NEON
// Direct subscripting of the NEON vector type (compiler vector extension).
1192 value[index] = new_value;
1193#endif
1194 }
1195
/// Operator[] overload (read-only); forwards to access(index).
/// @param index Lane to read.
/// @return The lane's value.
1201 force_inline float vector_call operator[](size_t index) const noexcept {
1202 return access(index);
1203 }
1204
1207 value = add(value, other.value);
1208 return *this;
1209 }
1211 value = sub(value, other.value);
1212 return *this;
1213 }
1215 value = mul(value, other.value);
1216 return *this;
1217 }
1219 value = div(value, other.value);
1220 return *this;
1221 }
1223 value = bitAnd(value, other.value);
1224 return *this;
1225 }
1227 value = bitOr(value, other.value);
1228 return *this;
1229 }
1231 value = bitXor(value, other.value);
1232 return *this;
1233 }
1234
/// Adds a raw SIMD register to this value lane by lane; returns *this.
1236 force_inline poly_float& vector_call operator+=(simd_type other) noexcept {
1237 value = add(value, other);
1238 return *this;
1239 }
/// Subtracts a raw SIMD register from this value lane by lane; returns *this.
1240 force_inline poly_float& vector_call operator-=(simd_type other) noexcept {
1241 value = sub(value, other);
1242 return *this;
1243 }
/// Multiplies this value by a raw SIMD register lane by lane; returns *this.
1244 force_inline poly_float& vector_call operator*=(simd_type other) noexcept {
1245 value = mul(value, other);
1246 return *this;
1247 }
/// Divides this value by a raw SIMD register lane by lane; returns *this.
1248 force_inline poly_float& vector_call operator/=(simd_type other) noexcept {
1249 value = div(value, other);
1250 return *this;
1251 }
/// Bitwise-ANDs this value with a raw integer mask register; returns *this.
1252 force_inline poly_float& vector_call operator&=(mask_simd_type other) noexcept {
1253 value = bitAnd(value, other);
1254 return *this;
1255 }
/// Bitwise-ORs this value with a raw integer mask register; returns *this.
1256 force_inline poly_float& vector_call operator|=(mask_simd_type other) noexcept {
1257 value = bitOr(value, other);
1258 return *this;
1259 }
/// Bitwise-XORs this value with a raw integer mask register; returns *this.
1260 force_inline poly_float& vector_call operator^=(mask_simd_type other) noexcept {
1261 value = bitXor(value, other);
1262 return *this;
1263 }
1264
1267 value = add(value, init(scalar));
1268 return *this;
1269 }
1271 value = sub(value, init(scalar));
1272 return *this;
1273 }
1275 value = mulScalar(value, scalar);
1276 return *this;
1277 }
1279 value = div(value, init(scalar));
1280 return *this;
1281 }
1282
1285 return add(value, other.value);
1286 }
1288 return sub(value, other.value);
1289 }
1291 return mul(value, other.value);
1292 }
1294 return div(value, other.value);
1295 }
/// Returns a copy of this value with every lane multiplied by scalar.
/// @param scalar Factor applied to every lane.
/// @return The scaled value; *this is not modified.
1296 force_inline poly_float vector_call operator*(float scalar) const noexcept {
1297 return mulScalar(value, scalar);
1298 }
1300 return bitAnd(value, other.value);
1301 }
1303 return bitOr(value, other.value);
1304 }
1306 return bitXor(value, other.value);
1307 }
1308
1311 return neg(value);
1312 }
1314 return bitNot(value);
1315 }
1316
/// Sums all elements in the SIMD float register; forwards to the static sum(simd_type).
/// @return Sum of the lanes.
1321 force_inline float vector_call sum() const noexcept {
1322 return sum(value);
1323 }
1324 };
1325
1326} // namespace vital
#define vector_call
Definition common.h:24
#define force_inline
Definition common.h:23
Contains classes and functions used within the Vital synthesizer framework.
poly_int poly_mask
Alias for clarity; used as a mask type in poly_float.
Definition poly_values.h:590
Represents a vector of floating-point values using SIMD instructions.
Definition poly_values.h:600
static force_inline poly_mask vector_call equal(poly_float one, poly_float two)
Definition poly_values.h:1093
static force_inline simd_type vector_call bitXor(simd_type value, mask_simd_type mask)
Bitwise XOR of a float SIMD register with a mask.
Definition poly_values.h:879
force_inline poly_float &vector_call operator^=(poly_mask other) noexcept
Definition poly_values.h:1230
force_inline float vector_call operator[](size_t index) const noexcept
Operator[] overload (read-only).
Definition poly_values.h:1201
force_inline poly_float &vector_call operator/=(float scalar) noexcept
Definition poly_values.h:1278
static force_inline void vector_call transpose(simd_type &row0, simd_type &row1, simd_type &row2, simd_type &row3)
Performs an in-place 4x4 transpose of four SSE/NEON registers containing float data.
Definition poly_values.h:1048
static force_inline mask_simd_type vector_call greaterThanOrEqual(simd_type one, simd_type two)
Compares two SIMD float registers, element-wise, for greater than or equal.
Definition poly_values.h:987
static force_inline poly_float vector_call min(poly_float one, poly_float two)
Definition poly_values.h:1084
force_inline poly_float &vector_call operator/=(simd_type other) noexcept
Definition poly_values.h:1248
static force_inline mask_simd_type vector_call toMask(simd_type value)
Interprets the bits of a float SIMD register as a mask (integer).
Definition poly_values.h:638
force_inline poly_float &vector_call operator|=(mask_simd_type other) noexcept
Definition poly_values.h:1256
static force_inline simd_type vector_call mulSub(simd_type one, simd_type two, simd_type three)
Multiply-subtract operation: returns one - (two * three), element-wise.
Definition poly_values.h:800
static force_inline poly_mask vector_call greaterThan(poly_float one, poly_float two)
Definition poly_values.h:1099
force_inline poly_float &vector_call operator&=(poly_mask other) noexcept
Definition poly_values.h:1222
static force_inline simd_type vector_call mul(simd_type one, simd_type two)
Multiplies two SIMD float registers element-wise.
Definition poly_values.h:746
force_inline poly_float &vector_call operator-=(poly_float other) noexcept
Definition poly_values.h:1210
static force_inline simd_type vector_call sub(simd_type one, simd_type two)
Subtracts one SIMD float register from another.
Definition poly_values.h:715
force_inline ~poly_float() noexcept
Destructor.
Definition poly_values.h:1158
static force_inline simd_type vector_call toSimd(mask_simd_type mask)
Interprets the bits of a mask SIMD register as float SIMD.
Definition poly_values.h:653
static force_inline poly_float vector_call mulAdd(poly_float one, poly_float two, poly_float three)
Convenience overloads returning poly_float instead of simd_type:
Definition poly_values.h:1075
force_inline poly_float &vector_call operator+=(poly_float other) noexcept
Compound assignment operators using poly_float.
Definition poly_values.h:1206
static force_inline mask_simd_type vector_call equal(simd_type one, simd_type two)
Compares two SIMD float registers for equality, element-wise.
Definition poly_values.h:954
force_inline poly_float &vector_call operator+=(float scalar) noexcept
Compound assignment operators using a scalar.
Definition poly_values.h:1266
static force_inline poly_mask vector_call notEqual(poly_float one, poly_float two)
Definition poly_values.h:1096
static force_inline poly_mask vector_call lessThan(poly_float one, poly_float two)
Definition poly_values.h:1105
force_inline poly_float &vector_call operator*=(simd_type other) noexcept
Definition poly_values.h:1244
force_inline poly_float &vector_call operator+=(simd_type other) noexcept
Compound assignment operators using simd_type.
Definition poly_values.h:1236
force_inline poly_float(simd_type initial_value) noexcept
Constructs from a raw SIMD register.
Definition poly_values.h:1123
static force_inline simd_type vector_call mulScalar(simd_type value, float scalar)
Multiplies a SIMD float register by a float scalar.
Definition poly_values.h:762
simd_type value
The underlying SIMD register for float.
Definition poly_values.h:1112
static force_inline mask_simd_type vector_call sign_mask(simd_type value)
Extracts the sign bit mask from each element in the SIMD float register.
Definition poly_values.h:944
force_inline poly_float vector_call operator+(poly_float other) const noexcept
Arithmetic operators.
Definition poly_values.h:1284
force_inline poly_float &vector_call operator|=(poly_mask other) noexcept
Definition poly_values.h:1226
static force_inline simd_type vector_call div(simd_type one, simd_type two)
Divides one SIMD float register by another, element-wise.
Definition poly_values.h:823
static force_inline simd_type vector_call min(simd_type one, simd_type two)
Returns the element-wise minimum of two SIMD float registers.
Definition poly_values.h:920
force_inline poly_float &vector_call operator^=(mask_simd_type other) noexcept
Definition poly_values.h:1260
force_inline poly_float &vector_call operator-=(simd_type other) noexcept
Definition poly_values.h:1240
force_inline poly_float &vector_call operator-=(float scalar) noexcept
Definition poly_values.h:1270
force_inline poly_float vector_call operator~() const noexcept
Definition poly_values.h:1313
static force_inline simd_type vector_call init(float scalar)
Initializes a SIMD register with the same float repeated.
Definition poly_values.h:668
force_inline poly_float vector_call operator-(poly_float other) const noexcept
Definition poly_values.h:1287
force_inline poly_float vector_call operator&(poly_mask other) const noexcept
Definition poly_values.h:1299
static force_inline poly_float vector_call max(poly_float one, poly_float two)
Definition poly_values.h:1081
static force_inline simd_type vector_call add(simd_type one, simd_type two)
Adds two SIMD float registers.
Definition poly_values.h:699
force_inline poly_float vector_call operator|(poly_mask other) const noexcept
Definition poly_values.h:1302
force_inline poly_float vector_call operator*(float scalar) const noexcept
Definition poly_values.h:1296
force_inline poly_float &vector_call operator*=(float scalar) noexcept
Definition poly_values.h:1274
static force_inline poly_mask vector_call sign_mask(poly_float value)
Definition poly_values.h:1090
force_inline poly_float() noexcept
Default constructor. Initializes to zero (0.0f).
Definition poly_values.h:1117
force_inline poly_float &vector_call operator&=(mask_simd_type other) noexcept
Definition poly_values.h:1252
force_inline poly_float &vector_call operator*=(poly_float other) noexcept
Definition poly_values.h:1214
force_inline poly_float(float initial_value) noexcept
Constructs the SIMD register by broadcasting a single float value.
Definition poly_values.h:1129
static force_inline poly_float vector_call abs(poly_float value)
Definition poly_values.h:1087
static force_inline simd_type vector_call max(simd_type one, simd_type two)
Returns the element-wise maximum of two SIMD float registers.
Definition poly_values.h:904
static force_inline float vector_call sum(simd_type value)
Computes the sum of all elements in a SIMD float register.
Definition poly_values.h:1020
static force_inline simd_type vector_call abs(simd_type value)
Computes the absolute value of each element in the SIMD float register.
Definition poly_values.h:935
force_inline poly_float vector_call operator-() const noexcept
Unary operators.
Definition poly_values.h:1310
force_inline float vector_call sum() const noexcept
Sums all elements in the SIMD float register.
Definition poly_values.h:1321
static force_inline poly_mask vector_call lessThanOrEqual(poly_float one, poly_float two)
Definition poly_values.h:1108
force_inline poly_float vector_call operator/(poly_float other) const noexcept
Definition poly_values.h:1293
force_inline float vector_call access(size_t index) const noexcept
Accessor for an element in the SIMD register.
Definition poly_values.h:1165
static force_inline mask_simd_type vector_call notEqual(simd_type one, simd_type two)
Compares two SIMD float registers for non-equality, element-wise.
Definition poly_values.h:1003
static force_inline simd_type vector_call bitOr(simd_type value, mask_simd_type mask)
Bitwise OR of a float SIMD register with a mask.
Definition poly_values.h:863
static force_inline simd_type vector_call mulAdd(simd_type one, simd_type two, simd_type three)
Fused multiply-add operation: returns one + (two * three), element-wise.
Definition poly_values.h:779
force_inline poly_float vector_call operator*(poly_float other) const noexcept
Definition poly_values.h:1290
static force_inline simd_type vector_call load(const float *memory)
Loads floating-point values from memory into a SIMD register.
Definition poly_values.h:683
force_inline poly_float(float initial_value1, float initial_value2) noexcept
Constructs a SIMD register by repeating two float values (for SSE2/NEON).
Definition poly_values.h:1138
force_inline void vector_call set(size_t index, float new_value) noexcept
Sets a specific element in the SIMD register.
Definition poly_values.h:1182
force_inline poly_float(float first, float second, float third, float fourth) noexcept
Constructs a SIMD register with four specified floats (for SSE2/NEON).
Definition poly_values.h:1150
static force_inline poly_float vector_call mulSub(poly_float one, poly_float two, poly_float three)
Definition poly_values.h:1078
force_inline poly_float &vector_call operator/=(poly_float other) noexcept
Definition poly_values.h:1218
static force_inline simd_type vector_call bitAnd(simd_type value, mask_simd_type mask)
Bitwise AND of a float SIMD register with a mask.
Definition poly_values.h:847
static force_inline simd_type vector_call bitNot(simd_type value)
Bitwise NOT of a float SIMD register.
Definition poly_values.h:894
force_inline poly_float vector_call operator^(poly_mask other) const noexcept
Definition poly_values.h:1305
static force_inline simd_type vector_call neg(simd_type value)
Negates a SIMD float register.
Definition poly_values.h:730
static force_inline mask_simd_type vector_call greaterThan(simd_type one, simd_type two)
Compares two SIMD float registers, element-wise, for greater than.
Definition poly_values.h:971
static force_inline poly_mask vector_call greaterThanOrEqual(poly_float one, poly_float two)
Definition poly_values.h:1102
Represents a vector of integer values using SIMD instructions.
Definition poly_values.h:56
force_inline poly_int &vector_call operator&=(simd_type other) noexcept
Definition poly_values.h:515
force_inline poly_int &vector_call operator&=(poly_int other) noexcept
Definition poly_values.h:489
force_inline poly_int(uint32_t first, uint32_t second) noexcept
Constructs a 4-element SIMD register by repeating two values (for SSE2/NEON).
Definition poly_values.h:423
simd_type value
The underlying SIMD register.
Definition poly_values.h:385
static force_inline simd_type vector_call mul(simd_type one, simd_type two)
Multiplies two SIMD integer registers element-wise.
Definition poly_values.h:173
force_inline void vector_call set(size_t index, uint32_t new_value) noexcept
Sets a specific element in the SIMD register.
Definition poly_values.h:453
force_inline poly_int &vector_call operator-=(uint32_t scalar) noexcept
Definition poly_values.h:533
force_inline poly_int &vector_call operator*=(poly_int other) noexcept
Definition poly_values.h:485
static force_inline simd_type vector_call init(uint32_t scalar)
Initializes a SIMD register with the same integer repeated.
Definition poly_values.h:95
force_inline poly_int &vector_call operator|=(poly_int other) noexcept
Definition poly_values.h:493
force_inline poly_int &vector_call operator+=(poly_int other) noexcept
Compound assignment operators using poly_int.
Definition poly_values.h:477
static force_inline poly_int vector_call greaterThan(poly_int one, poly_int two)
Definition poly_values.h:375
force_inline poly_int &vector_call operator+=(uint32_t scalar) noexcept
Compound assignment operators using a scalar.
Definition poly_values.h:529
force_inline poly_int vector_call operator-(poly_int other) const noexcept
Definition poly_values.h:546
static force_inline simd_type vector_call max(simd_type one, simd_type two)
Returns the element-wise maximum of two SIMD integer registers.
Definition poly_values.h:253
force_inline poly_int vector_call operator+(poly_int other) const noexcept
Arithmetic operators.
Definition poly_values.h:543
static force_inline uint32_t vector_call anyMask(simd_type value)
Returns a bitmask that indicates which bytes/elements in the register are non-zero.
Definition poly_values.h:352
force_inline poly_int vector_call operator|(poly_int other) const noexcept
Definition poly_values.h:557
static force_inline uint32_t vector_call sum(simd_type value)
Computes the sum of all elements in a SIMD integer register.
Definition poly_values.h:326
force_inline uint32_t vector_call operator[](size_t index) const noexcept
Operator[] overload (read-only).
Definition poly_values.h:472
force_inline poly_int(simd_type initial_value) noexcept
Constructs from a raw SIMD register.
Definition poly_values.h:396
force_inline uint32_t vector_call access(size_t index) const noexcept
Accessor for an element in the SIMD register.
Definition poly_values.h:436
static force_inline simd_type vector_call add(simd_type one, simd_type two)
Adds two SIMD integer registers.
Definition poly_values.h:126
static force_inline simd_type vector_call load(const uint32_t *memory)
Loads integer values from memory into a SIMD register.
Definition poly_values.h:110
force_inline poly_int vector_call operator^(poly_int other) const noexcept
Definition poly_values.h:560
static force_inline simd_type vector_call bitNot(simd_type value)
Bitwise NOT of a SIMD integer register.
Definition poly_values.h:243
force_inline poly_int &vector_call operator*=(uint32_t scalar) noexcept
Definition poly_values.h:537
force_inline poly_int &vector_call operator+=(simd_type other) noexcept
Compound assignment operators using simd_type.
Definition poly_values.h:503
force_inline poly_int &vector_call operator*=(simd_type other) noexcept
Definition poly_values.h:511
force_inline poly_int() noexcept
Default constructor. Initializes to zero.
Definition poly_values.h:390
force_inline poly_int &vector_call operator|=(simd_type other) noexcept
Definition poly_values.h:519
static force_inline poly_int vector_call min(poly_int one, poly_int two)
Definition poly_values.h:369
static force_inline simd_type vector_call neg(simd_type value)
Negates a SIMD integer register.
Definition poly_values.h:157
force_inline poly_int(uint32_t first, uint32_t second, uint32_t third, uint32_t fourth) noexcept
Constructs the SIMD register with four specified integers.
Definition poly_values.h:413
force_inline poly_int &vector_call operator^=(simd_type other) noexcept
Definition poly_values.h:523
static constexpr uint32_t kSignMask
Sign bit mask.
Definition poly_values.h:87
static force_inline poly_int vector_call equal(poly_int one, poly_int two)
Definition poly_values.h:372
force_inline poly_int vector_call operator*(poly_int other) const noexcept
Definition poly_values.h:549
force_inline poly_int vector_call operator&(poly_int other) const noexcept
Bitwise operators.
Definition poly_values.h:554
static force_inline poly_int vector_call max(poly_int one, poly_int two)
Convenience overloads returning poly_int instead of simd_type:
Definition poly_values.h:366
static force_inline simd_type vector_call bitOr(simd_type value, simd_type mask)
Bitwise OR of a SIMD integer register with another.
Definition poly_values.h:212
static force_inline simd_type vector_call min(simd_type one, simd_type two)
Returns the element-wise minimum of two SIMD integer registers.
Definition poly_values.h:272
force_inline uint32_t vector_call anyMask() const noexcept
Returns a bitmask for elements that are non-zero.
Definition poly_values.h:584
static constexpr uint32_t kFullMask
All bits set.
Definition poly_values.h:86
static constexpr uint32_t kNotSignMask
Inverted sign bit mask.
Definition poly_values.h:88
force_inline uint32_t vector_call sum() const noexcept
Sums all elements in the SIMD register.
Definition poly_values.h:576
static force_inline simd_type vector_call greaterThan(simd_type one, simd_type two)
Compares two SIMD integer registers, element-wise, for greater than.
Definition poly_values.h:309
force_inline poly_int vector_call operator-() const noexcept
Unary operators.
Definition poly_values.h:565
static force_inline simd_type vector_call equal(simd_type one, simd_type two)
Compares two SIMD integer registers for equality, element-wise.
Definition poly_values.h:291
force_inline poly_int(uint32_t initial_value) noexcept
Constructs the SIMD register by broadcasting a single integer value.
Definition poly_values.h:402
force_inline poly_int &vector_call operator-=(simd_type other) noexcept
Definition poly_values.h:507
static force_inline simd_type vector_call sub(simd_type one, simd_type two)
Subtracts one SIMD integer register from another.
Definition poly_values.h:142
force_inline ~poly_int() noexcept
Destructor.
Definition poly_values.h:429
static force_inline poly_int vector_call lessThan(poly_int one, poly_int two)
Definition poly_values.h:378
force_inline poly_int &vector_call operator^=(poly_int other) noexcept
Definition poly_values.h:497
static force_inline simd_type vector_call bitAnd(simd_type value, simd_type mask)
Bitwise AND of a SIMD integer register with another.
Definition poly_values.h:196
force_inline poly_int &vector_call operator-=(poly_int other) noexcept
Definition poly_values.h:481
static force_inline simd_type vector_call bitXor(simd_type value, simd_type mask)
Bitwise XOR of a SIMD integer register with another.
Definition poly_values.h:228
force_inline poly_int vector_call operator~() const noexcept
Definition poly_values.h:568
static force_inline uint32_t vector_call sum(poly_int value)
Definition poly_values.h:381
Helper union for copying between a scalar array and a SIMD type.
Definition poly_values.h:628
simd_type simd
Access data in SIMD form.
Definition poly_values.h:630
float scalar[kSize]
Access data in scalar form.
Definition poly_values.h:629
Helper union for copying between a SIMD type and a scalar array.
Definition poly_values.h:619
simd_type simd
Access data in SIMD form.
Definition poly_values.h:620
float scalar[kSize]
Access data in scalar form.
Definition poly_values.h:621
Helper union for copying between a scalar array and a SIMD type.
Definition poly_values.h:72
simd_type simd
Access data in SIMD form.
Definition poly_values.h:74
int32_t scalar[kSize]
Access data in scalar form.
Definition poly_values.h:73
Helper union for copying between a SIMD type and a scalar array.
Definition poly_values.h:81
int32_t scalar[kSize]
Access data in scalar form.
Definition poly_values.h:83
simd_type simd
Access data in SIMD form.
Definition poly_values.h:82