BmnRoot
Loading...
Searching...
No Matches
P4_F32vec4.h
Go to the documentation of this file.
1#ifndef L1Algo_F32vec4P4_H
2#define L1Algo_F32vec4P4_H
4#include "vec_arithmetic.h"
5#include "xmmintrin.h"
7#include <cmath>
8#include <iostream>
/**********************************
 *
 *   Vector of four single floats
 *
 **********************************/

// #pragma pack(push,16)/* Must ensure class & union 16-B aligned */

// typedef __m128 VectorFloat __attribute__ ((aligned(16)));

/* Scalar 1.0f whose raw bit pattern can be read back as an int through member i. */
const union {
  float f;
  int i;
} __f_one = {1.f};

/*
 * Per-lane 32-bit patterns that are read back as an __m128 through member m.
 * They back the _f32vec4_* macros below.
 *   abs_mask : every bit except the sign bit (AND with it clears the sign)
 *   sgn_mask : only the sign bit
 *   zero     : all bits clear in every lane
 *   one      : bit pattern of 1.0f in every lane (taken from __f_one)
 *   true     : all bits set   (the "true" lane mask)
 *   false    : all bits clear (the "false" lane mask)
 */
const union {
  int i[4];
  __m128 m;
} __f32vec4_abs_mask_cheat = {{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}},
  __f32vec4_sgn_mask_cheat = {{static_cast<int>(0x80000000), static_cast<int>(0x80000000), static_cast<int>(0x80000000),
                               static_cast<int>(0x80000000)}},
  __f32vec4_zero_cheat     = {{0, 0, 0, 0}},
  __f32vec4_one_cheat      = {{__f_one.i, __f_one.i, __f_one.i, __f_one.i}},
  __f32vec4_true_cheat     = {{static_cast<int>(0xFFFFFFFF), static_cast<int>(0xFFFFFFFF), static_cast<int>(0xFFFFFFFF),
                               static_cast<int>(0xFFFFFFFF)}},
  __f32vec4_false_cheat    = {{0x00000000, 0x00000000, 0x00000000, 0x00000000}};
/*
 * F32vec4-typed views of the constant bit patterns defined above.
 * Each macro reads the __m128 member of the matching *_cheat constant and
 * converts it to F32vec4.
 */
#define _f32vec4_abs_mask (static_cast<F32vec4>(__f32vec4_abs_mask_cheat.m))
#define _f32vec4_sgn_mask (static_cast<F32vec4>(__f32vec4_sgn_mask_cheat.m))
#define _f32vec4_zero (static_cast<F32vec4>(__f32vec4_zero_cheat.m))
#define _f32vec4_one (static_cast<F32vec4>(__f32vec4_one_cheat.m))
#define _f32vec4_true (static_cast<F32vec4>(__f32vec4_true_cheat.m))
#define _f32vec4_false (static_cast<F32vec4>(__f32vec4_false_cheat.m))
44{
45 public:
46 __m128 v;
47
48 float& operator[](int i) { return (reinterpret_cast<float*>(&v))[i]; }
49 float operator[](int i) const { return (reinterpret_cast<const float*>(&v))[i]; }
50
52 : v(_mm_set_ps1(0))
53 {}
54 F32vec4(const __m128& a)
55 : v(a)
56 {}
57 F32vec4(const float& a)
58 : v(_mm_set_ps1(a))
59 {}
61 F32vec4(const float& f0, const float& f1, const float& f2, const float& f3)
62 : v(_mm_set_ps(f3, f2, f1, f0))
63 {}
65 /* Conversion function */
66 operator __m128() const { return v; } /* Convert to __m128 */
67
68 /* Arithmetic Operators */
69 friend F32vec4 operator+(const F32vec4& a, const F32vec4& b) { return _mm_add_ps(a, b); }
70 friend F32vec4 operator-(const F32vec4& a, const F32vec4& b) { return _mm_sub_ps(a, b); }
71 friend F32vec4 operator*(const F32vec4& a, const F32vec4& b) { return _mm_mul_ps(a, b); }
72 friend F32vec4 operator/(const F32vec4& a, const F32vec4& b) { return _mm_div_ps(a, b); }
73
74 /* Functions */
75 friend F32vec4 min(const F32vec4& a, const F32vec4& b) { return _mm_min_ps(a, b); }
76 friend F32vec4 max(const F32vec4& a, const F32vec4& b) { return _mm_max_ps(a, b); }
77
78 /* Square Root */
79 friend F32vec4 sqrt(const F32vec4& a) { return _mm_sqrt_ps(a); }
80
81 /* Reciprocal( inverse) Square Root */
82 friend F32vec4 rsqrt(const F32vec4& a) { return _mm_rsqrt_ps(a); }
83
84 /* Reciprocal (inversion) */
85 // friend F32vec4 rcp ( const F32vec4 &a ){ return _mm_rcp_ps (a); }
86 /* Reciprocal (inversion) */
87 // friend F32vec4 rcp ( const F32vec4 &a ){ return 1. / a; }
88 /* NewtonRaphson Reciprocal
89 [2 * rcpps(x) - (x * rcpps(x) * rcpps(x))] */
90 friend F32vec4 rcp(const F32vec4& a)
91 {
92 F32vec4 Ra0 = _mm_rcp_ps(a);
93 return _mm_sub_ps(_mm_add_ps(Ra0, Ra0), _mm_mul_ps(_mm_mul_ps(Ra0, a), Ra0));
94 }
95
96 /* Absolute value */
97 friend F32vec4 fabs(const F32vec4& a) { return _mm_and_ps(a, _f32vec4_abs_mask); }
99 /* Sign */
100 friend F32vec4 sgn(const F32vec4& a) { return _mm_or_ps(_mm_and_ps(a, _f32vec4_sgn_mask), _f32vec4_one); }
101 friend F32vec4 asgnb(const F32vec4& a, const F32vec4& b) { return _mm_or_ps(_mm_and_ps(b, _f32vec4_sgn_mask), a); }
102
103 /* Logical */
104
105 friend F32vec4 operator&(const F32vec4& a, const F32vec4& b)
106 { // mask returned
107 return _mm_and_ps(a, b);
108 }
109 friend F32vec4 operator|(const F32vec4& a, const F32vec4& b)
110 { // mask returned
111 return _mm_or_ps(a, b);
112 }
113 friend F32vec4 operator^(const F32vec4& a, const F32vec4& b)
114 { // mask returned
115 return _mm_xor_ps(a, b);
117 friend F32vec4 operator!(const F32vec4& a)
118 { // mask returned
119 return _mm_xor_ps(a, _f32vec4_true);
120 }
121 // friend F32vec4 operator||( const F32vec4 &a, const F32vec4 &b ){ // mask returned
122 // return _mm_or_ps(a, b);
123 // }
125 /* Comparison */
127 friend F32vec4 operator<(const F32vec4& a, const F32vec4& b)
128 { // mask returned
129 return _mm_cmplt_ps(a, b);
131 friend F32vec4 operator<=(const F32vec4& a, const F32vec4& b)
132 { // mask returned
133 return _mm_cmple_ps(a, b);
134 }
135 friend F32vec4 operator>(const F32vec4& a, const F32vec4& b)
136 { // mask returned
137 return _mm_cmpgt_ps(a, b);
138 }
139 friend F32vec4 operator>=(const F32vec4& a, const F32vec4& b)
140 { // mask returned
141 return _mm_cmpge_ps(a, b);
142 }
143 friend F32vec4 operator==(const F32vec4& a, const F32vec4& b)
144 { // mask returned
145 return _mm_cmpeq_ps(a, b);
146 }
147
// Lane-wise select: where mask a is set take b, elsewhere take c.
// The whole replacement list is parenthesized so the macro stays one operand
// when it is used inside a larger expression (e.g. `x * if3(m, b, c)`).
#define if3(a, b, c) (((a) & (b)) | ((!(a)) & (c)))  // analog (a) ? b : c

// Vec4NotEmpty: true when at least one of the four lanes converts to true.
// Vec4Empty:    true when none of the four lanes converts to true.
// Both are parenthesized as a whole so that e.g. `!Vec4NotEmpty(m)` negates
// the full result and not only the first lane.
#define Vec4NotEmpty(a) (bool((a)[0]) | bool((a)[1]) | bool((a)[2]) | bool((a)[3]))
#define Vec4Empty(a) (!(bool((a)[0]) | bool((a)[1]) | bool((a)[2]) | bool((a)[3])))
// bool NotEmpty(const F32vec4 &a) { return a[0]||a[1]||a[2]||a[3]; }
// bool Empty(const F32vec4 &a) { return !(a[0]||a[1]||a[2]||a[3]); } // optimize
154 friend F32vec4 bool2int(const F32vec4& a)
155 { // mask returned
156 return if3(a, 1, 0);
157 }
158
159 /* Define all operators for consistensy */
160
162
163 /* Non intrinsic functions */
164
165#define _f1(A, F) F32vec4(F(A[0]), F(A[1]), F(A[2]), F(A[3]))
166
167 friend F32vec4 exp(const F32vec4& a) { return _f1(a, exp); }
168 friend F32vec4 log(const F32vec4& a) { return _f1(a, log); }
169 friend F32vec4 sin(const F32vec4& a) { return _f1(a, sin); }
170 friend F32vec4 cos(const F32vec4& a) { return _f1(a, cos); }
171 friend F32vec4 acos(const F32vec4& a) { return _f1(a, acos); }
172
173#undef _f1
174
175 friend F32vec4 atan2(const F32vec4& y, const F32vec4& x)
177 const F32vec4 pi(3.1415926535897932);
178 const F32vec4 pi_2 = pi / 2;
179 const F32vec4 zero(0);
180
181 const F32vec4& xZero = F32vec4(x == zero);
182 const F32vec4& yZero = F32vec4(y == zero);
183 const F32vec4& xNeg = F32vec4(x < zero);
184 const F32vec4& yNeg = F32vec4(y < zero);
185
186 const F32vec4& absX = fabs(x);
187 const F32vec4& absY = fabs(y);
188
189 F32vec4 a = absY / absX;
190 const F32vec4 pi_4 = pi / 4;
191 const F32vec4& gt_tan_3pi_8 = F32vec4(a > F32vec4(2.414213562373095));
192 const F32vec4& gt_tan_pi_8 = F32vec4(a > F32vec4(0.4142135623730950)) & F32vec4(!gt_tan_3pi_8);
193 const F32vec4 minusOne(-1);
194 F32vec4 b(zero);
195 b = (pi_2 & gt_tan_3pi_8) + (F32vec4(!gt_tan_3pi_8) & b);
196 b = (pi_4 & gt_tan_pi_8) + (F32vec4(!gt_tan_pi_8) & b);
197 a = (gt_tan_3pi_8 & (minusOne / a)) + (F32vec4(!gt_tan_3pi_8) & a);
198 a = (gt_tan_pi_8 & ((absY - absX) / (absY + absX))) + (F32vec4(!gt_tan_pi_8) & a);
199 const F32vec4& a2 = a * a;
200 b +=
201 (((8.05374449538e-2 * a2 - 1.38776856032E-1) * a2 + 1.99777106478E-1) * a2 - 3.33329491539E-1) * a2 * a + a;
202 F32vec4 xyNeg = F32vec4(xNeg ^ yNeg);
203 b = (xyNeg & (-b)) + (F32vec4(!xyNeg) & b);
204 xyNeg = F32vec4(xNeg & !yNeg);
205 b = (xyNeg & (b + pi)) + (F32vec4(!xyNeg) & b);
206 xyNeg = F32vec4(xNeg & yNeg);
207 b = (xyNeg & (b - pi)) + (F32vec4(!xyNeg) & b);
208 xyNeg = F32vec4(xZero & yZero);
209 b = (xyNeg & zero) + (F32vec4(!xyNeg) & b);
210 xyNeg = F32vec4(xZero & yNeg);
211 b = (xyNeg & (-pi_2)) + (F32vec4(!xyNeg) & b);
212 return b;
213 }
214
215 friend std::ostream& operator<<(std::ostream& strm, const F32vec4& a)
216 {
217 strm << "[" << a[0] << " " << a[1] << " " << a[2] << " " << a[3] << "]";
218 return strm;
219 }
220
221 friend std::istream& operator>>(std::istream& strm, F32vec4& a)
222 {
223 float tmp;
224 strm >> tmp;
225 a = tmp;
226 return strm;
227 }
228
} __attribute__((aligned(16)));  // the type is forced to 16-byte alignment
231typedef F32vec4 fvec;
232typedef float fscal;
233const int fvecLen = 4;
234// #define fvec_true _f32vec4_true
235// #define fvec_false _f32vec4_false
236#define _fvecalignment __attribute__((aligned(16)))
237
238#include "std_alloc.h"
239
240#endif
float fscal
Definition P4_F32vec4.h:232
const int fvecLen
Definition P4_F32vec4.h:233
int i
Definition P4_F32vec4.h:22
const union @17 __f32vec4_false_cheat
__m128 m
Definition P4_F32vec4.h:27
const union @17 __f32vec4_zero_cheat
#define _f32vec4_true
Definition P4_F32vec4.h:40
const union @17 __f32vec4_one_cheat
const union @17 __f32vec4_sgn_mask_cheat
const union @16 __f_one
#define _f32vec4_sgn_mask
Definition P4_F32vec4.h:37
#define _f32vec4_abs_mask
Definition P4_F32vec4.h:36
#define _f32vec4_one
Definition P4_F32vec4.h:39
const union @17 __f32vec4_abs_mask_cheat
const union @17 __f32vec4_true_cheat
float f
Definition P4_F32vec4.h:21
F32vec4 fvec
Definition P4_F32vec4.h:231
nsL1vector __attribute__
#define _f1(A, F)
F32vec4(const float &a)
Definition P4_F32vec4.h:57
friend F32vec4 operator>(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:135
float & operator[](int i)
Definition P4_F32vec4.h:48
friend F32vec4 acos(const F32vec4 &a)
Definition P4_F32vec4.h:171
friend F32vec4 sgn(const F32vec4 &a)
Definition P4_F32vec4.h:100
friend F32vec4 max(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:76
friend F32vec4 operator|(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:109
friend F32vec4 rcp(const F32vec4 &a)
Definition P4_F32vec4.h:90
__m128 v
Definition P4_F32vec4.h:46
float operator[](int i) const
Definition P4_F32vec4.h:49
friend F32vec4 sin(const F32vec4 &a)
Definition P4_F32vec4.h:169
friend F32vec4 if3(const F32vec4 &a, const F32vec4 &b, const F32vec4 &c)
friend F32vec4 operator-(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:70
friend F32vec4 fabs(const F32vec4 &a)
Definition P4_F32vec4.h:97
friend F32vec4 bool2int(const F32vec4 &a)
Definition P4_F32vec4.h:154
friend F32vec4 rsqrt(const F32vec4 &a)
Definition P4_F32vec4.h:82
friend F32vec4 asgnb(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:101
friend F32vec4 operator<=(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:131
friend F32vec4 operator&(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:105
friend std::istream & operator>>(std::istream &strm, F32vec4 &a)
Definition P4_F32vec4.h:221
F32vec4(const float &f0, const float &f1, const float &f2, const float &f3)
Definition P4_F32vec4.h:61
friend F32vec4 operator<(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:127
friend F32vec4 atan2(const F32vec4 &y, const F32vec4 &x)
Definition P4_F32vec4.h:175
friend F32vec4 operator>=(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:139
friend F32vec4 operator*(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:71
friend std::ostream & operator<<(std::ostream &strm, const F32vec4 &a)
Definition P4_F32vec4.h:215
friend F32vec4 exp(const F32vec4 &a)
Definition P4_F32vec4.h:167
friend F32vec4 operator+(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:69
vec_arithmetic(F32vec4, float)
friend F32vec4 operator/(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:72
friend F32vec4 sqrt(const F32vec4 &a)
Definition P4_F32vec4.h:79
friend F32vec4 log(const F32vec4 &a)
Definition P4_F32vec4.h:168
friend F32vec4 operator!(const F32vec4 &a)
Definition P4_F32vec4.h:117
friend F32vec4 cos(const F32vec4 &a)
Definition P4_F32vec4.h:170
friend F32vec4 operator^(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:113
F32vec4(const __m128 &a)
Definition P4_F32vec4.h:54
friend F32vec4 operator==(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:143
friend F32vec4 min(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:75