00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042 #ifndef EIGEN_INVERSE_SSE_H
00043 #define EIGEN_INVERSE_SSE_H
00044
00045 namespace internal {
00046
00047 template<typename MatrixType, typename ResultType>
00048 struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
00049 {
00050 enum {
00051 MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
00052 ResultAlignment = bool(ResultType::Flags&AlignedBit),
00053 StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
00054 };
00055
00056 static void run(const MatrixType& matrix, ResultType& result)
00057 {
00058 EIGEN_ALIGN16 const int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
00059
00060
00061 __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
00062 __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
00063 __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
00064 __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
00065
00066
00067
00068
00069
00070
00071
00072 __m128 A, B, C, D;
00073 if(!StorageOrdersMatch)
00074 {
00075 A = _mm_unpacklo_ps(_L1, _L2);
00076 B = _mm_unpacklo_ps(_L3, _L4);
00077 C = _mm_unpackhi_ps(_L1, _L2);
00078 D = _mm_unpackhi_ps(_L3, _L4);
00079 }
00080 else
00081 {
00082 A = _mm_movelh_ps(_L1, _L2);
00083 B = _mm_movehl_ps(_L2, _L1);
00084 C = _mm_movelh_ps(_L3, _L4);
00085 D = _mm_movehl_ps(_L4, _L3);
00086 }
00087
00088 __m128 iA, iB, iC, iD,
00089 DC, AB;
00090 __m128 dA, dB, dC, dD;
00091 __m128 det, d, d1, d2;
00092 __m128 rd;
00093
00094
00095 AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
00096 AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
00097
00098 DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
00099 DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
00100
00101
00102 dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
00103 dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
00104
00105 dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
00106 dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));
00107
00108
00109 dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
00110 dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
00111
00112 dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
00113 dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));
00114
00115
00116 d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
00117
00118
00119 iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
00120 iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
00121
00122 iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
00123 iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
00124
00125
00126 d = _mm_add_ps(d, _mm_movehl_ps(d, d));
00127 d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
00128 d1 = _mm_mul_ss(dA,dD);
00129 d2 = _mm_mul_ss(dB,dC);
00130
00131
00132 iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
00133
00134
00135 iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
00136
00137
00138 det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
00139 rd = _mm_div_ss(_mm_set_ss(1.0f), det);
00140
00141
00142
00143
00144
00145
00146 iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
00147 iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
00148
00149 iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
00150 iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
00151
00152 rd = _mm_shuffle_ps(rd,rd,0);
00153 rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP));
00154
00155
00156 iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
00157
00158
00159 iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
00160
00161
00162 iA = _mm_mul_ps(rd,iA);
00163 iB = _mm_mul_ps(rd,iB);
00164 iC = _mm_mul_ps(rd,iC);
00165 iD = _mm_mul_ps(rd,iD);
00166
00167 result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77));
00168 result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22));
00169 result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77));
00170 result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22));
00171 }
00172
00173 };
00174
00175 template<typename MatrixType, typename ResultType>
00176 struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
00177 {
00178 enum {
00179 MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
00180 ResultAlignment = bool(ResultType::Flags&AlignedBit),
00181 StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
00182 };
00183 static void run(const MatrixType& matrix, ResultType& result)
00184 {
00185 const EIGEN_ALIGN16 long long int _Sign_NP[2] = { 0x8000000000000000ll, 0x0000000000000000ll };
00186 const EIGEN_ALIGN16 long long int _Sign_PN[2] = { 0x0000000000000000ll, 0x8000000000000000ll };
00187
00188
00189
00190
00191
00192
00193
00194
00195 __m128d A1, A2, B1, B2, C1, C2, D1, D2;
00196
00197 if(StorageOrdersMatch)
00198 {
00199 A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
00200 A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
00201 C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
00202 C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
00203 }
00204 else
00205 {
00206 __m128d tmp;
00207 A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
00208 A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
00209 tmp = A1;
00210 A1 = _mm_unpacklo_pd(A1,A2);
00211 A2 = _mm_unpackhi_pd(tmp,A2);
00212 tmp = C1;
00213 C1 = _mm_unpacklo_pd(C1,C2);
00214 C2 = _mm_unpackhi_pd(tmp,C2);
00215
00216 B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
00217 B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
00218 tmp = B1;
00219 B1 = _mm_unpacklo_pd(B1,B2);
00220 B2 = _mm_unpackhi_pd(tmp,B2);
00221 tmp = D1;
00222 D1 = _mm_unpacklo_pd(D1,D2);
00223 D2 = _mm_unpackhi_pd(tmp,D2);
00224 }
00225
00226 __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2,
00227 DC1, DC2, AB1, AB2;
00228 __m128d dA, dB, dC, dD;
00229 __m128d det, d1, d2, rd;
00230
00231
00232 dA = _mm_shuffle_pd(A2, A2, 1);
00233 dA = _mm_mul_pd(A1, dA);
00234 dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
00235
00236 dB = _mm_shuffle_pd(B2, B2, 1);
00237 dB = _mm_mul_pd(B1, dB);
00238 dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));
00239
00240
00241 AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
00242 AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
00243 AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
00244 AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));
00245
00246
00247 dC = _mm_shuffle_pd(C2, C2, 1);
00248 dC = _mm_mul_pd(C1, dC);
00249 dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
00250
00251 dD = _mm_shuffle_pd(D2, D2, 1);
00252 dD = _mm_mul_pd(D1, dD);
00253 dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));
00254
00255
00256 DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
00257 DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
00258 DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
00259 DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));
00260
00261
00262 d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
00263 d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
00264 rd = _mm_add_pd(d1, d2);
00265 rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));
00266
00267
00268 iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
00269 iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
00270 iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
00271 iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));
00272
00273
00274 iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
00275 iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
00276 iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
00277 iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
00278
00279
00280 dA = _mm_shuffle_pd(dA,dA,0);
00281 iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
00282 iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
00283
00284
00285 dD = _mm_shuffle_pd(dD,dD,0);
00286 iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
00287 iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
00288
00289 d1 = _mm_mul_sd(dA, dD);
00290 d2 = _mm_mul_sd(dB, dC);
00291
00292
00293 iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
00294 iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
00295 iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
00296 iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
00297
00298
00299 det = _mm_add_sd(d1, d2);
00300 det = _mm_sub_sd(det, rd);
00301
00302
00303 iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
00304 iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
00305 iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
00306 iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
00307
00308 rd = _mm_div_sd(_mm_set_sd(1.0), det);
00309
00310
00311
00312 rd = _mm_shuffle_pd(rd,rd,0);
00313
00314
00315 dB = _mm_shuffle_pd(dB,dB,0);
00316 iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
00317 iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
00318
00319 d1 = _mm_xor_pd(rd, _mm_load_pd((double*)_Sign_PN));
00320 d2 = _mm_xor_pd(rd, _mm_load_pd((double*)_Sign_NP));
00321
00322
00323 dC = _mm_shuffle_pd(dC,dC,0);
00324 iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
00325 iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
00326
00327 result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
00328 result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
00329 result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
00330 result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
00331 result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
00332 result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
00333 result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
00334 result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
00335 }
00336 };
00337
00338 }
00339
00340 #endif // EIGEN_INVERSE_SSE_H