00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
00026 #define EIGEN_PACKET_MATH_ALTIVEC_H
00027
00028 namespace internal {
00029
// Default tuning constants for this backend. Each macro is only defined here
// when it has not already been defined before this header is included.

// Cache-friendly product threshold, defaulting to 4.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

// Flag defaulting to 1.
#if !defined(EIGEN_HAS_FUSE_CJMADD)
#define EIGEN_HAS_FUSE_CJMADD 1
#endif

// Register count assumed by default: 16.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
00042
00043 typedef __vector float Packet4f;
00044 typedef __vector int Packet4i;
00045 typedef __vector unsigned int Packet4ui;
00046 typedef __vector __bool int Packet4bi;
00047 typedef __vector short int Packet8i;
00048 typedef __vector unsigned char Packet16uc;
00049
00050
00051
00052
// Helper macros declaring named packet constants. Each one expands to a
// declaration WITHOUT a trailing semicolon, so the call site may prefix a
// storage class, e.g. `static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);`.

// Declares p4f_NAME: X is splatted with vec_splat_s32 and the resulting
// integer lanes are reinterpreted (vector cast, no value conversion) as floats.
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)

// Declares p4i_NAME: X splatted into all lanes with vec_splat_s32.
// NOTE(review): vec_splat_s32 takes a small literal (-16..15 per the AltiVec
// PIM); use _EIGEN_DECLARE_CONST_Packet4i for anything else.
#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)

// Declares p4f_NAME: the float X broadcast through pset1<Packet4f>.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declares p4f_NAME: a float packet whose lanes hold the bit pattern of the
// integer X. X is broadcast as an integer packet and reinterpreted with an
// AltiVec vector cast (NEON reinterpret intrinsics do not exist on this target).
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  Packet4f p4f_##NAME = (Packet4f) pset1<Packet4i>(X)

// Declares p4i_NAME: the int X broadcast through pset1<Packet4i>.
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = pset1<Packet4i>(X)

// Data-stream channel passed to vec_dstt by prefetch().
#define DST_CHAN 1
// Packs the data-stream control word: size from bit 24 up, count from bit 16
// up, stride in the low bits.
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
00070
00071
00072 static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
00073 static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
00074 static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
00075 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
00076 static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
00077
00078 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
00079 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
00080 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
00081 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
00082 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
00083 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
00084 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
00085
00086 template<> struct packet_traits<float> : default_packet_traits
00087 {
00088 typedef Packet4f type;
00089 enum {
00090 Vectorizable = 1,
00091 AlignedOnScalar = 1,
00092 size=4,
00093
00094
00095 HasSin = 0,
00096 HasCos = 0,
00097 HasLog = 0,
00098 HasExp = 0,
00099 HasSqrt = 0
00100 };
00101 };
00102 template<> struct packet_traits<int> : default_packet_traits
00103 {
00104 typedef Packet4i type;
00105 enum {
00106
00107 Vectorizable = 1,
00108 AlignedOnScalar = 1,
00109 size=4
00110 };
00111 };
00112
00113 template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
00114 template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
00161
00162 float EIGEN_ALIGN16 af[4];
00163 af[0] = from;
00164 Packet4f vc = vec_ld(0, af);
00165 vc = vec_splat(vc, 0);
00166 return vc;
00167 }
00168
00169 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
00170 int EIGEN_ALIGN16 ai[4];
00171 ai[0] = from;
00172 Packet4i vc = vec_ld(0, ai);
00173 vc = vec_splat(vc, 0);
00174 return vc;
00175 }
00176
00177 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
00178 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
00179
00180 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
00181 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
00182
00183 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
00184 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
00185
00186 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
00187 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
00188
00189 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
00227 {
00228 Packet4f t, y_0, y_1, res;
00229
00230
00231 y_0 = vec_re(b);
00232
00233
00234 t = vec_nmsub(y_0, b, p4f_ONE);
00235 y_1 = vec_madd(y_0, t, y_0);
00236
00237 res = vec_madd(a, y_1, p4f_ZERO);
00238 return res;
00239 }
00240
00241 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& , const Packet4i& )
00242 { eigen_assert(false && "packet integer division are not supported by AltiVec");
00243 return pset1<Packet4i>(0);
00244 }
00245
00246
00247 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
00248 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
00249
00250 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
00251 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
00252
00253 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
00254 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
00255
00256
00257 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
00258 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
00259
00260 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
00261 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
00262
00263 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
00264 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
00265
00266 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
00267 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
00268
00269 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00270 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00271
00272 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
00273 {
00274 EIGEN_DEBUG_ALIGNED_LOAD
00275
00276 Packet16uc MSQ, LSQ;
00277 Packet16uc mask;
00278 MSQ = vec_ld(0, (unsigned char *)from);
00279 LSQ = vec_ld(15, (unsigned char *)from);
00280 mask = vec_lvsl(0, from);
00281 return (Packet4f) vec_perm(MSQ, LSQ, mask);
00282
00283 }
00284 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
00285 {
00286 EIGEN_DEBUG_ALIGNED_LOAD
00287
00288 Packet16uc MSQ, LSQ;
00289 Packet16uc mask;
00290 MSQ = vec_ld(0, (unsigned char *)from);
00291 LSQ = vec_ld(15, (unsigned char *)from);
00292 mask = vec_lvsl(0, from);
00293 return (Packet4i) vec_perm(MSQ, LSQ, mask);
00294 }
00295
00296 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
00297 {
00298 Packet4f p;
00299 if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from);
00300 else p = ploadu<Packet4f>(from);
00301 return vec_perm(p, p, p16uc_DUPLICATE);
00302 }
00303 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
00304 {
00305 Packet4i p;
00306 if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from);
00307 else p = ploadu<Packet4i>(from);
00308 return vec_perm(p, p, p16uc_DUPLICATE);
00309 }
00310
00311 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00312 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00313
00314 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
00315 {
00316 EIGEN_DEBUG_UNALIGNED_STORE
00317
00318
00319 Packet16uc MSQ, LSQ, edges;
00320 Packet16uc edgeAlign, align;
00321
00322 MSQ = vec_ld(0, (unsigned char *)to);
00323 LSQ = vec_ld(15, (unsigned char *)to);
00324 edgeAlign = vec_lvsl(0, to);
00325 edges=vec_perm(LSQ,MSQ,edgeAlign);
00326 align = vec_lvsr( 0, to );
00327 MSQ = vec_perm(edges,(Packet16uc)from,align);
00328 LSQ = vec_perm((Packet16uc)from,edges,align);
00329 vec_st( LSQ, 15, (unsigned char *)to );
00330 vec_st( MSQ, 0, (unsigned char *)to );
00331 }
00332 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
00333 {
00334 EIGEN_DEBUG_UNALIGNED_STORE
00335
00336
00337 Packet16uc MSQ, LSQ, edges;
00338 Packet16uc edgeAlign, align;
00339
00340 MSQ = vec_ld(0, (unsigned char *)to);
00341 LSQ = vec_ld(15, (unsigned char *)to);
00342 edgeAlign = vec_lvsl(0, to);
00343 edges=vec_perm(LSQ, MSQ, edgeAlign);
00344 align = vec_lvsr( 0, to );
00345 MSQ = vec_perm(edges, (Packet16uc) from, align);
00346 LSQ = vec_perm((Packet16uc) from, edges, align);
00347 vec_st( LSQ, 15, (unsigned char *)to );
00348 vec_st( MSQ, 0, (unsigned char *)to );
00349 }
00350
00351 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00352 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00353
00354 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00355 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00356
00357 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00358 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00359
00360 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
00361 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
00362
00363 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00364 {
00365 Packet4f b, sum;
00366 b = (Packet4f) vec_sld(a, a, 8);
00367 sum = vec_add(a, b);
00368 b = (Packet4f) vec_sld(sum, sum, 4);
00369 sum = vec_add(sum, b);
00370 return pfirst(sum);
00371 }
00372
00373 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00374 {
00375 Packet4f v[4], sum[4];
00376
00377
00378
00379
00380 v[0] = vec_mergeh(vecs[0], vecs[2]);
00381 v[1] = vec_mergel(vecs[0], vecs[2]);
00382 v[2] = vec_mergeh(vecs[1], vecs[3]);
00383 v[3] = vec_mergel(vecs[1], vecs[3]);
00384
00385 sum[0] = vec_mergeh(v[0], v[2]);
00386 sum[1] = vec_mergel(v[0], v[2]);
00387 sum[2] = vec_mergeh(v[1], v[3]);
00388 sum[3] = vec_mergel(v[1], v[3]);
00389
00390
00391
00392 sum[0] = vec_add(sum[0], sum[1]);
00393
00394 sum[1] = vec_add(sum[2], sum[3]);
00395
00396 sum[0] = vec_add(sum[0], sum[1]);
00397
00398 return sum[0];
00399 }
00400
00401 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00402 {
00403 Packet4i sum;
00404 sum = vec_sums(a, p4i_ZERO);
00405 sum = vec_sld(sum, p4i_ZERO, 12);
00406 return pfirst(sum);
00407 }
00408
00409 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00410 {
00411 Packet4i v[4], sum[4];
00412
00413
00414
00415
00416 v[0] = vec_mergeh(vecs[0], vecs[2]);
00417 v[1] = vec_mergel(vecs[0], vecs[2]);
00418 v[2] = vec_mergeh(vecs[1], vecs[3]);
00419 v[3] = vec_mergel(vecs[1], vecs[3]);
00420
00421 sum[0] = vec_mergeh(v[0], v[2]);
00422 sum[1] = vec_mergel(v[0], v[2]);
00423 sum[2] = vec_mergeh(v[1], v[3]);
00424 sum[3] = vec_mergel(v[1], v[3]);
00425
00426
00427
00428 sum[0] = vec_add(sum[0], sum[1]);
00429
00430 sum[1] = vec_add(sum[2], sum[3]);
00431
00432 sum[0] = vec_add(sum[0], sum[1]);
00433
00434 return sum[0];
00435 }
00436
00437
00438
00439 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
00440 {
00441 Packet4f prod;
00442 prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
00443 return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
00444 }
00445
00446 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
00447 {
00448 EIGEN_ALIGN16 int aux[4];
00449 pstore(aux, a);
00450 return aux[0] * aux[1] * aux[2] * aux[3];
00451 }
00452
00453
00454 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
00455 {
00456 Packet4f b, res;
00457 b = vec_min(a, vec_sld(a, a, 8));
00458 res = vec_min(b, vec_sld(b, b, 4));
00459 return pfirst(res);
00460 }
00461
00462 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
00463 {
00464 Packet4i b, res;
00465 b = vec_min(a, vec_sld(a, a, 8));
00466 res = vec_min(b, vec_sld(b, b, 4));
00467 return pfirst(res);
00468 }
00469
00470
00471 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
00472 {
00473 Packet4f b, res;
00474 b = vec_max(a, vec_sld(a, a, 8));
00475 res = vec_max(b, vec_sld(b, b, 4));
00476 return pfirst(res);
00477 }
00478
00479 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
00480 {
00481 Packet4i b, res;
00482 b = vec_max(a, vec_sld(a, a, 8));
00483 res = vec_max(b, vec_sld(b, b, 4));
00484 return pfirst(res);
00485 }
00486
00487 template<int Offset>
00488 struct palign_impl<Offset,Packet4f>
00489 {
00490 EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
00491 {
00492 if (Offset!=0)
00493 first = vec_sld(first, second, Offset*4);
00494 }
00495 };
00496
00497 template<int Offset>
00498 struct palign_impl<Offset,Packet4i>
00499 {
00500 EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
00501 {
00502 if (Offset!=0)
00503 first = vec_sld(first, second, Offset*4);
00504 }
00505 };
00506
00507 }
00508
00509 #endif // EIGEN_PACKET_MATH_ALTIVEC_H