Diffstat (limited to 'src/3rdparty/eigen/Eigen/src/Core/products')
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h  2645
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h  517
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h  317
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h  145
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h  124
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h  518
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h  136
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h  180
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h  544
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h  295
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h  262
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h  118
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h  133
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h  94
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h  472
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h  317
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h  350
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h  255
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h  337
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h  167
-rw-r--r--  src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h  148
21 files changed, 8074 insertions, 0 deletions
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
new file mode 100644
index 000000000..f35b760c1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -0,0 +1,2645 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
+#define EIGEN_GENERAL_BLOCK_PANEL_H
+
+
+namespace Eigen {
+
+namespace internal {
+
+enum GEBPPacketSizeType {
+ GEBPPacketFull = 0,
+ GEBPPacketHalf,
+ GEBPPacketQuarter
+};
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
+class gebp_traits;
+
+
+/** \internal \returns b if a<=0, and returns a otherwise. */
+inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
+{
+ return a<=0 ? b : a;
+}
+
+#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
+#elif EIGEN_ARCH_PPC
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
+#else
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
+#endif
+
+#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
+
+/** \internal */
+struct CacheSizes {
+ CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
+ int l1CacheSize, l2CacheSize, l3CacheSize;
+ queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+ m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+ m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+ m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
+ }
+
+ std::ptrdiff_t m_l1;
+ std::ptrdiff_t m_l2;
+ std::ptrdiff_t m_l3;
+};
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+{
+ static CacheSizes m_cacheSizes;
+
+ if(action==SetAction)
+ {
+ // set the CPU cache sizes (in bytes) from which all block sizes are computed
+ eigen_internal_assert(l1!=0 && l2!=0);
+ m_cacheSizes.m_l1 = *l1;
+ m_cacheSizes.m_l2 = *l2;
+ m_cacheSizes.m_l3 = *l3;
+ }
+ else if(action==GetAction)
+ {
+ eigen_internal_assert(l1!=0 && l2!=0);
+ *l1 = m_cacheSizes.m_l1;
+ *l2 = m_cacheSizes.m_l2;
+ *l3 = m_cacheSizes.m_l3;
+ }
+ else
+ {
+ eigen_internal_assert(false);
+ }
+}
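The cache sizes are queried once and memoized in the static CacheSizes instance above. A minimal standalone sketch of reading them back through this entry point (assuming an Eigen build that ships this header; manage_caching_sizes() is an internal helper, not public API) could look like:

    #include <Eigen/Core>
    #include <cstddef>
    #include <iostream>

    int main() {
      std::ptrdiff_t l1 = 0, l2 = 0, l3 = 0;
      // GetAction fills in the memoized (queried or default) cache sizes.
      Eigen::internal::manage_caching_sizes(Eigen::GetAction, &l1, &l2, &l3);
      std::cout << "L1=" << l1 << "B  L2=" << l2 << "B  L3=" << l3 << "B\n";
      return 0;
    }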
+
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depend on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ // Explanations:
+ // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+ // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+ // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+ // at the register level. This small horizontal panel has to stay within L1 cache.
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ #ifdef EIGEN_VECTORIZE_AVX512
+ // We still need to find a rationale for this adjustment, but without it,
+ // performance with AVX512 is pretty bad, roughly 20% slower.
+ // One reason is that with increasing packet size, the blocking size k
+ // has to become pretty small if we want one lhs panel to fit within L1.
+ // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
+ // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+ // This is quite small for a good reuse of the accumulation registers.
+ l1 *= 4;
+ #endif
+
+ if (num_threads > 1) {
+ typedef typename Traits::ResScalar ResScalar;
+ enum {
+ kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+ ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+ kr = 8,
+ mr = Traits::mr,
+ nr = Traits::nr
+ };
+ // Increasing k gives us more time to prefetch the content of the "C"
+ // registers. However once the latency is hidden there is no point in
+ // increasing the value of k, so we'll cap it at 320 (value determined
+ // experimentally).
+ // To prevent k from vanishing, we make k_cache at least as big as kr
+ const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
+ if (k_cache < k) {
+ k = k_cache - (k_cache % kr);
+ eigen_internal_assert(k > 0);
+ }
+
+ const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+ const Index n_per_thread = numext::div_ceil(n, num_threads);
+ if (n_cache <= n_per_thread) {
+ // Don't exceed the capacity of the l2 cache.
+ eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+ n = n_cache - (n_cache % nr);
+ eigen_internal_assert(n > 0);
+ } else {
+ n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
+ }
+
+ if (l3 > l2) {
+ // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+ const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+ const Index m_per_thread = numext::div_ceil(m, num_threads);
+ if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+ m = m_cache - (m_cache % mr);
+ eigen_internal_assert(m > 0);
+ } else {
+ m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
+ }
+ }
+ }
+ else {
+ // In unit tests we do not want to use extra large matrices,
+ // so we reduce the cache sizes to check that the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+ l1 = 9*1024;
+ l2 = 32*1024;
+ l3 = 512*1024;
+#endif
+
+ // Early return for small problems, because the computations below are time consuming.
+ // Perhaps it would make more sense to consider k*n*m??
+ // Note that for very tiny problems, this function should be bypassed anyway
+ // because we use the coefficient-based implementation for them.
+ if((numext::maxi)(k,(numext::maxi)(m,n))<48)
+ return;
+
+ typedef typename Traits::ResScalar ResScalar;
+ enum {
+ k_peeling = 8,
+ k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+ k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+ };
+
+ // ---- 1st level of blocking on L1, yields kc ----
+
+ // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
+ // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
+ // We also include a register-level block of the result (mr x nr).
+ // (In an ideal world only the lhs panel would stay in L1)
+ // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+ const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+ const Index old_k = k;
+ if(k>max_kc)
+ {
+ // We are really blocking on the third dimension:
+ // -> reduce blocking size to make sure the last block is as large as possible
+ // while keeping the same number of sweeps over the result.
+ k = (k%max_kc)==0 ? max_kc
+ : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+ eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+ }
+
+ // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+ // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+ // actual_l2 = max(l2, l3/nb_core_sharing_l3)
+ // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
+ // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+ #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+ const Index actual_l2 = l3;
+ #else
+ const Index actual_l2 = 1572864; // == 1.5 MB
+ #endif
+
+ // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
+ // The second half is implicitly reserved to access the result and lhs coefficients.
+ // When k<max_kc, nc can grow arbitrarily. In practice, it seems fruitful
+ // to limit this growth: we bound nc to grow by a factor of at most 1.5.
+ // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
+ // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+ Index max_nc;
+ const Index lhs_bytes = m * k * sizeof(LhsScalar);
+ const Index remaining_l1 = l1- k_sub - lhs_bytes;
+ if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+ {
+ // L1 blocking
+ max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+ }
+ else
+ {
+ // L2 blocking
+ max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+ }
+ // WARNING Below, we assume that Traits::nr is a power of two.
+ Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+ if(n>nc)
+ {
+ // We are really blocking over the columns:
+ // -> reduce blocking size to make sure the last block is as large as possible
+ // while keeping the same number of sweeps over the packed lhs.
+ // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+ n = (n%nc)==0 ? nc
+ : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+ }
+ else if(old_k==k)
+ {
+ // So far, no blocking at all, i.e., kc==k, and nc==n.
+ // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+ // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
+ Index problem_size = k*n*sizeof(LhsScalar);
+ Index actual_lm = actual_l2;
+ Index max_mc = m;
+ if(problem_size<=1024)
+ {
+ // the problem is small enough to keep in L1
+ // Let's choose m such that the lhs block fits in 1/3 of L1
+ actual_lm = l1;
+ }
+ else if(l3!=0 && problem_size<=32768)
+ {
+ // we have both L2 and L3, and the problem is small enough to be kept in L2
+ // Let's choose m such that the lhs block fits in 1/3 of L2
+ actual_lm = l2;
+ max_mc = (numext::mini<Index>)(576,max_mc);
+ }
+ Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+ if (mc > Traits::mr) mc -= mc % Traits::mr;
+ else if (mc==0) return;
+ m = (m%mc)==0 ? mc
+ : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+ }
+ }
+}
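As a rough numerical illustration of the max_kc bound above, the following standalone sketch redoes the arithmetic with assumed values (double scalars, a 32 KiB L1, mr = 12, nr = 4, KcFactor = 1); the real mr/nr come from gebp_traits and depend on the target architecture:

    #include <cstddef>
    #include <iostream>

    int main() {
      // Assumed values, for illustration only.
      const std::ptrdiff_t l1 = 32 * 1024;                  // L1 size in bytes
      const std::ptrdiff_t mr = 12, nr = 4, k_peeling = 8;  // register blocking and loop peeling
      const std::ptrdiff_t k_div = mr * sizeof(double) + nr * sizeof(double); // bytes per unit of k
      const std::ptrdiff_t k_sub = mr * nr * sizeof(double);                  // register block of the result
      // Same formula as max_kc above: fit the panels in L1, rounded down to a multiple of k_peeling.
      const std::ptrdiff_t max_kc = ((l1 - k_sub) / k_div) & ~(k_peeling - 1);
      std::cout << "max_kc = " << max_kc << "\n";           // prints 248 with these assumptions
      return 0;
    }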
+
+template <typename Index>
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+ if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+ k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+ m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+ n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+ return true;
+ }
+#else
+ EIGEN_UNUSED_VARIABLE(k)
+ EIGEN_UNUSED_VARIABLE(m)
+ EIGEN_UNUSED_VARIABLE(n)
+#endif
+ return false;
+}
+
+/** \brief Computes the blocking parameters for an m x k times k x n matrix product
+ *
+ * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+ * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+ * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+ *
+ * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms.
+ *
+ * The blocking size parameters may be evaluated:
+ * - either by a heuristic based on cache sizes;
+ * - or using fixed prescribed values (for testing purposes).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+ if (!useSpecificBlockingSizes(k, m, n)) {
+ evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
+ }
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+ computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
+}
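A minimal usage sketch of the entry point above (internal API, with the signature declared here): for an assumed m=1000, k=2000, n=3000 double product, the three variables are overwritten with the chosen blocking sizes kc, mc and nc.

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      // k = depth, m = rows of the lhs, n = columns of the rhs, as documented above.
      Eigen::Index k = 2000, m = 1000, n = 3000;
      Eigen::internal::computeProductBlockingSizes<double, double>(k, m, n);
      std::cout << "kc=" << k << "  mc=" << m << "  nc=" << n << "\n";
      return 0;
    }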
+
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+ static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+ typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
+};
+
+template <typename Packet>
+struct QuadPacket
+{
+ Packet B_0, B1, B2, B3;
+ const Packet& get(const FixedInt<0>&) const { return B_0; }
+ const Packet& get(const FixedInt<1>&) const { return B1; }
+ const Packet& get(const FixedInt<2>&) const { return B2; }
+ const Packet& get(const FixedInt<3>&) const { return B3; }
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<name ## Scalar>::type, \
+ typename packet_traits<name ## Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+ prefix ## name ## Packet
+
+#define PACKET_DECL_COND(name, packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<name ## Scalar>::type, \
+ typename packet_traits<name ## Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+ name ## Packet
+
+#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<Scalar>::type, \
+ typename packet_traits<Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+ prefix ## ScalarPacket
+
+#define PACKET_DECL_COND_SCALAR(packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<Scalar>::type, \
+ typename packet_traits<Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+ ScalarPacket
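These macros only select the full, half, or quarter packet of a scalar type through packet_conditional. A compile-time sketch restating the three cases, assuming float scalars and an Eigen version that exposes these internals:

    #include <Eigen/Core>
    #include <type_traits>
    using namespace Eigen::internal;

    typedef packet_traits<float>::type   FullP;     // widest packet of float on this target
    typedef packet_traits<float>::half   HalfP;     // its half
    typedef unpacket_traits<HalfP>::half QuarterP;  // half of the half

    static_assert(std::is_same<packet_conditional<GEBPPacketFull,    FullP, HalfP, QuarterP>::type, FullP>::value,    "full");
    static_assert(std::is_same<packet_conditional<GEBPPacketHalf,    FullP, HalfP, QuarterP>::type, HalfP>::value,    "half");
    static_assert(std::is_same<packet_conditional<GEBPPacketQuarter, FullP, HalfP, QuarterP>::type, QuarterP>::value, "quarter");

    int main() { return 0; }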
+
+/* Vectorization logic
+ * real*real: unpack rhs to constant packets, ...
+ *
+ * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ * storing each res packet into two packets (2x2),
+ * at the end combine them: swap the second and addsub them
+ * cf*cf : same but with 2x4 blocks
+ * cplx*real : unpack rhs to constant packets, ...
+ * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits
+{
+public:
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+ // register block size along the N direction must be 1 or 4
+ nr = 4,
+
+ // register block size along the M direction (currently, this one cannot be modified)
+ default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
+ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
+ // we assume 16 registers or more
+ // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+ // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+ // Bug 1515: MSVC prior to v19.14 leads to register spilling.
+ mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+#else
+ mr = default_mr,
+#endif
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = 1
+ };
+
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
+
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ dest = pset1<RhsPacketType>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ploadquad<RhsPacket>(b);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = pload<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploadu<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+ {
+ conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
+ // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+ // let gcc allocate the register in which to store the result of the pmul
+ // (in the case where there is no FMA) gcc fails to figure out how to avoid
+ // spilling registers.
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c = cj.pmadd(a,b,c);
+#else
+ tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
+#endif
+ }
+
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+ template<typename ResPacketHalf>
+ EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+};
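A minimal sketch of how the traits contract above is consumed: initAcc/loadLhs/loadRhs feed madd, and acc folds alpha into the result. Float scalars and C++14 (for the fix<> syntax) are assumed; the real kernels below unroll, peel and interleave these calls across several accumulators.

    #include <Eigen/Core>

    int main() {
      using namespace Eigen;
      typedef internal::gebp_traits<float, float> Traits;
      Traits traits;

      EIGEN_ALIGN_MAX float a[Traits::LhsPacketSize] = {1.f};  // one packed lhs fragment
      float b = 2.f;                                           // one rhs coefficient
      EIGEN_ALIGN_MAX float r[Traits::ResPacketSize] = {};     // one result fragment

      Traits::AccPacket C0;
      traits.initAcc(C0);                                      // C0 = 0

      Traits::LhsPacket A0;
      Traits::RhsPacket B0, T0;
      traits.loadLhs(a, A0);                                   // A0 = packed lhs
      traits.loadRhs(&b, B0);                                  // B0 = broadcast rhs coefficient
      traits.madd(A0, B0, C0, T0, fix<0>());                   // C0 += A0 * B0

      Traits::ResPacket R      = internal::pload<Traits::ResPacket>(r);
      Traits::ResPacket alphav = internal::pset1<Traits::ResPacket>(1.f);
      traits.acc(C0, alphav, R);                               // R += alpha * C0
      internal::pstore(r, R);
      return 0;
    }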
+
+template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
+{
+public:
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef RealScalar RhsScalar;
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = false,
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ nr = 4,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+ // we assume 16 registers
+ mr = 3*LhsPacketSize,
+#else
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#endif
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
+
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ dest = pset1<RhsPacketType>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
+ {
+ // FIXME we can do better!
+ // what we want here is a ploadheight
+ RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
+ dest = ploadquad<RhsPacket>(tmp);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
+ {
+ eigen_internal_assert(RhsPacketSize<=8);
+ dest = pset1<RhsPacket>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = pload<LhsPacket>(a);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploadu<LhsPacketType>(a);
+ }
+
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+ {
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ }
+
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
+ {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a.v,b,c.v);
+#else
+ tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+ {
+ c += a * b;
+ }
+
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ template <typename ResPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+ {
+ conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
+ r = cj.pmadd(c,alpha,r);
+ }
+
+protected:
+};
+
+template<typename Packet>
+struct DoublePacket
+{
+ Packet first;
+ Packet second;
+};
+
+template<typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+{
+ DoublePacket<Packet> res;
+ res.first = padd(a.first, b.first);
+ res.second = padd(a.second,b.second);
+ return res;
+}
+
+// note that for DoublePacket<RealPacket> the "4" in "downto4"
+// corresponds to the number of complex numbers, so it means "8"
+// in terms of real coefficients.
+
+template<typename Packet>
+const DoublePacket<Packet>&
+predux_half_dowto4(const DoublePacket<Packet> &a,
+ typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
+{
+ return a;
+}
+
+template<typename Packet>
+DoublePacket<typename unpacket_traits<Packet>::half>
+predux_half_dowto4(const DoublePacket<Packet> &a,
+ typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
+{
+ // yes, that's pretty hackish :(
+ DoublePacket<typename unpacket_traits<Packet>::half> res;
+ typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
+ typedef typename packet_traits<Cplx>::type CplxPacket;
+ res.first = predux_half_dowto4(CplxPacket(a.first)).v;
+ res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+ return res;
+}
+
+// same here, "quad" actually means "8" in terms of real coefficients
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+ typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
+{
+ dest.first = pset1<RealPacket>(numext::real(*b));
+ dest.second = pset1<RealPacket>(numext::imag(*b));
+}
+
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+ typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
+{
+ // yes, that's pretty hackish too :(
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
+ RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
+ dest.first = ploadquad<RealPacket>(r);
+ dest.second = ploadquad<RealPacket>(i);
+}
+
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
+ typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+};
+// template<typename Packet>
+// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+// {
+// DoublePacket<Packet> res;
+// res.first = padd(a.first, b.first);
+// res.second = padd(a.second,b.second);
+// return res;
+// }
+
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef std::complex<RealScalar> RhsScalar;
+ typedef std::complex<RealScalar> ResScalar;
+
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+ PACKET_DECL_COND(Real, _PacketSize);
+ PACKET_DECL_COND_SCALAR(_PacketSize);
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = unpacket_traits<RealPacket>::vectorizable
+ && unpacket_traits<ScalarPacket>::vectorizable,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+ RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
+
+ // FIXME: should depend on NumberOfRegisters
+ nr = 4,
+ mr = ResPacketSize,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef DoublePacket<RealPacket> DoublePacketType;
+
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
+ typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+ // this actually holds 8 packets!
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
+
+ EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+
+ EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
+ {
+ p.first = pset1<RealPacket>(RealScalar(0));
+ p.second = pset1<RealPacket>(RealScalar(0));
+ }
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
+ {
+ dest = pset1<ScalarPacket>(*b);
+ }
+
+ // Vectorized path
+ template<typename RealPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
+ {
+ dest.first = pset1<RealPacketType>(numext::real(*b));
+ dest.second = pset1<RealPacketType>(numext::imag(*b));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ loadRhs(b, dest.B_0);
+ loadRhs(b + 1, dest.B1);
+ loadRhs(b + 2, dest.B2);
+ loadRhs(b + 3, dest.B3);
+ }
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ // Vectorized path
+ template<typename RealPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+ {
+ loadQuadToDoublePacket(b,dest);
+ }
+
+ // nothing special here
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
+ }
+
+ template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+ EIGEN_STRONG_INLINE
+ typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+ madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
+ {
+ c.first = padd(pmul(a,b.first), c.first);
+ c.second = padd(pmul(a,b.second),c.second);
+ }
+
+ template<typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
+ {
+ c = cj.pmadd(a,b,c);
+ }
+
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
+
+ template<typename RealPacketType, typename ResPacketType>
+ EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
+ {
+ // assemble c
+ ResPacketType tmp;
+ if((!ConjLhs)&&(!ConjRhs))
+ {
+ tmp = pcplxflip(pconj(ResPacketType(c.second)));
+ tmp = padd(ResPacketType(c.first),tmp);
+ }
+ else if((!ConjLhs)&&(ConjRhs))
+ {
+ tmp = pconj(pcplxflip(ResPacketType(c.second)));
+ tmp = padd(ResPacketType(c.first),tmp);
+ }
+ else if((ConjLhs)&&(!ConjRhs))
+ {
+ tmp = pcplxflip(ResPacketType(c.second));
+ tmp = padd(pconj(ResPacketType(c.first)),tmp);
+ }
+ else if((ConjLhs)&&(ConjRhs))
+ {
+ tmp = pcplxflip(ResPacketType(c.second));
+ tmp = psub(pconj(ResPacketType(c.first)),tmp);
+ }
+
+ r = pmadd(tmp,alpha,r);
+ }
+
+protected:
+ conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+};
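To see why the flip/conjugate combine in acc() above reconstructs the complex product described in the vectorization-logic comment earlier, here is a scalar sketch of the non-conjugated case (plain doubles stand in for the packet lanes; illustration only):

    #include <cassert>
    #include <complex>

    int main() {
      const double ar = 1.5, ai = -2.0, br = 0.5, bi = 3.0;  // a = ar + i*ai, b = br + i*bi
      // What madd() accumulates: c.first <- a * real(b), c.second <- a * imag(b), lane-wise.
      const double first_re  = ar * br, first_im  = ai * br; // (ar*br, ai*br)
      const double second_re = ar * bi, second_im = ai * bi; // (ar*bi, ai*bi)
      // acc() with ConjLhs=ConjRhs=false: tmp = pcplxflip(pconj(c.second)) = (-ai*bi, ar*bi),
      // then r += alpha * (c.first + tmp). With alpha = 1:
      const double res_re = first_re - second_im;            // ar*br - ai*bi
      const double res_im = first_im + second_re;            // ai*br + ar*bi
      const std::complex<double> ref = std::complex<double>(ar, ai) * std::complex<double>(br, bi);
      assert(res_re == ref.real() && res_im == ref.imag());  // exact for these binary-friendly values
      return 0;
    }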
+
+template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef RealScalar LhsScalar;
+ typedef Scalar RhsScalar;
+ typedef Scalar ResScalar;
+
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
+ PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
+
+#undef PACKET_DECL_COND_SCALAR_PREFIX
+#undef PACKET_DECL_COND_PREFIX
+#undef PACKET_DECL_COND_SCALAR
+#undef PACKET_DECL_COND
+
+ enum {
+ ConjLhs = false,
+ ConjRhs = _ConjRhs,
+ Vectorizable = unpacket_traits<_RealPacket>::vectorizable
+ && unpacket_traits<_ScalarPacket>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ // FIXME: should depend on NumberOfRegisters
+ nr = 4,
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ dest = pset1<RhsPacketType>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploaddup<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ploadquad<RhsPacket>(b);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploaddup<LhsPacketType>(a);
+ }
+
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+ {
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ }
+
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
+ {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a,b.v,c.v);
+#else
+ tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+#endif
+
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+ {
+ c += a * b;
+ }
+
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ template <typename ResPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+ {
+ conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
+ r = cj.pmadd(alpha,c,r);
+ }
+
+protected:
+
+};
+
+/* optimized General packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ * | A | B | comments
+ * |real |cplx | no vectorization yet, would require packing A with duplication
+ * |cplx |real | easy vectorization
+ */
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel
+{
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
+
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::AccPacket AccPacket;
+ typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+ typedef typename SwappedTraits::ResScalar SResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
+ typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+ typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
+ typedef typename DataMapper::LinearMapper LinearMapper;
+
+ enum {
+ Vectorizable = Traits::Vectorizable,
+ LhsProgress = Traits::LhsProgress,
+ LhsProgressHalf = HalfTraits::LhsProgress,
+ LhsProgressQuarter = QuarterTraits::LhsProgress,
+ RhsProgress = Traits::RhsProgress,
+ RhsProgressHalf = HalfTraits::RhsProgress,
+ RhsProgressQuarter = QuarterTraits::RhsProgress,
+ ResPacketSize = Traits::ResPacketSize
+ };
+
+ EIGEN_DONT_INLINE
+ void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
+int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
+struct last_row_process_16_packets
+{
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+ ResScalar alpha, SAccPacket &C0)
+ {
+ EIGEN_UNUSED_VARIABLE(res);
+ EIGEN_UNUSED_VARIABLE(straits);
+ EIGEN_UNUSED_VARIABLE(blA);
+ EIGEN_UNUSED_VARIABLE(blB);
+ EIGEN_UNUSED_VARIABLE(depth);
+ EIGEN_UNUSED_VARIABLE(endk);
+ EIGEN_UNUSED_VARIABLE(i);
+ EIGEN_UNUSED_VARIABLE(j2);
+ EIGEN_UNUSED_VARIABLE(alpha);
+ EIGEN_UNUSED_VARIABLE(C0);
+ }
+};
+
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+ ResScalar alpha, SAccPacket &C0)
+ {
+ typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
+ typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
+ typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
+ typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
+
+ SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
+ SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
+
+ if (depth - endk > 0)
+ {
+ // We have to handle the last row(s) of the rhs, which
+ // correspond to a half-packet
+ SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+
+ for (Index kk = endk; kk < depth; kk++)
+ {
+ SLhsPacketQuarter a0;
+ SRhsPacketQuarter b0;
+ straits.loadLhsUnaligned(blB, a0);
+ straits.loadRhs(blA, b0);
+ straits.madd(a0,b0,c0,b0, fix<0>);
+ blB += SwappedTraits::LhsProgress/4;
+ blA += 1;
+ }
+ straits.acc(c0, alphav, R);
+ }
+ else
+ {
+ straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+ }
+ res.scatterPacket(i, j2, R);
+ }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_one_packet
+{
+ typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
+
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+ {
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+ traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+ traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+ traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+ traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+ traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
+ #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+ __asm__ ("" : "+x,m" (*A0));
+ #endif
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+ }
+
+ EIGEN_STRONG_INLINE void operator()(
+ const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+ Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+ int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+ {
+ GEBPTraits traits;
+
+ // loops on each largest micro horizontal panel of lhs
+ // (LhsProgress x depth)
+ for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a LhsProgress x nr micro block of res
+ // which is entirely stored into 1 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+ // To improve instruction pipelining, let's double the accumulation registers:
+ // even k will accumulate in C*, while odd k will accumulate in D*.
+ // This trick is crucial to get good performance with FMA; otherwise it is
+ // actually faster to perform separate MUL+ADD because of the naturally
+ // better instruction-level parallelism.
+ AccPacket D0, D1, D2, D3;
+ traits.initAcc(D0);
+ traits.initAcc(D1);
+ traits.initAcc(D2);
+ traits.initAcc(D3);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+
+ internal::prefetch(blB+(48+0));
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ internal::prefetch(blB+(48+16));
+ peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
+ }
+ C0 = padd(C0,D0);
+ C1 = padd(C1,D1);
+ C2 = padd(C2,D2);
+ C3 = padd(C3,D3);
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ blB += 4*RhsProgress;
+ blA += LhsProgress;
+ }
+
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0);
+ R1 = r1.template loadPacket<ResPacket>(0);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C1, alphav, R1);
+ r0.storePacket(0, R0);
+ r1.storePacket(0, R1);
+
+ R0 = r2.template loadPacket<ResPacket>(0);
+ R1 = r3.template loadPacket<ResPacket>(0);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C3, alphav, R1);
+ r2.storePacket(0, R0);
+ r3.storePacket(0, R1);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0;
+ traits.initAcc(C0);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ LhsPacket A0;
+
+ for(Index k= 0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+ RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ /* FIXME: why unaligned???? */ \
+ traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
+ } while(false);
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+ R0 = r0.template loadPacket<ResPacket>(0);
+ traits.acc(C0, alphav, R0);
+ r0.storePacket(0, R0);
+ }
+ }
+ }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
+{
+
+EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+ {
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+ traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
+ traits.madd(*A0, *B_0, *C0, *B_0);
+ traits.madd(*A0, *B1, *C1, *B1);
+ traits.madd(*A0, *B2, *C2, *B2);
+ traits.madd(*A0, *B3, *C3, *B3);
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+ }
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
+ ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA, Index strideB, Index offsetA, Index offsetB)
+ {
+ Traits traits;
+ SwappedTraits straits;
+
+ if(strideA==-1) strideA = depth;
+ if(strideB==-1) strideB = depth;
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+ const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
+ const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
+ const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
+ const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
+ const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
+ enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+ const Index peeled_kc = depth & ~(pk-1);
+ const int prefetch_res_offset = 32/sizeof(ResScalar);
+// const Index depth2 = depth & ~1;
+
+ //---------- Process 3 * LhsProgress rows at once ----------
+ // This corresponds to 3*LhsProgress x nr register blocks.
+ // Usually, this makes sense only with FMA
+ if(mr>=3*Traits::LhsProgress)
+ {
+ // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+ // and on each largest micro vertical panel of the rhs (depth * nr).
+ // The blocking size 'depth' has been computed so that the micro horizontal panel of the lhs fits in L1.
+ // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+ // This actual number of rows is computed as follows:
+ const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+ // or because we are testing specific blocking sizes.
+ const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
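+      // For instance, assuming AVX floats (LhsProgress==8, mr==24, nr==4, 4-byte scalars) and l1==32*1024,
+      // the formula above gives: depth==64  -> (32768 - 384 - 1024)/6144  == 5, i.e. 120-row panels,
+      // whereas depth==256 -> (32768 - 384 - 4096)/24576 == 1, i.e. a single 24-row stripe per panel.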
+ for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
+ {
+ const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+ {
+
+ // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 3 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3,
+ C4, C5, C6, C7,
+ C8, C9, C10, C11;
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+ traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(0);
+ r1.prefetch(0);
+ r2.prefetch(0);
+ r3.prefetch(0);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
+ // 15 registers are taken (12 for acc, 2 for lhs).
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
+ LhsPacket A2;
+ #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+ // without this workaround A0, A1, and A2 are loaded in the same register,
+ // which is not good for pipelining
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
+ #else
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+ #endif
+#define EIGEN_GEBP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
+ } /* Bug 953 */ \
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+ traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
+ traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
+ traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
+ traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+ } while (false)
+
+ internal::prefetch(blB);
+ EIGEN_GEBP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(1);
+ EIGEN_GEBP_ONESTEP(2);
+ EIGEN_GEBP_ONESTEP(3);
+ EIGEN_GEBP_ONESTEP(4);
+ EIGEN_GEBP_ONESTEP(5);
+ EIGEN_GEBP_ONESTEP(6);
+ EIGEN_GEBP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*3*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
+ LhsPacket A2;
+ EIGEN_GEBP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 3*Traits::LhsProgress;
+ }
+
+#undef EIGEN_GEBP_ONESTEP
+
+ ResPacket R0, R1, R2;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C8, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C1, alphav, R0);
+ traits.acc(C5, alphav, R1);
+ traits.acc(C9, alphav, R2);
+ r1.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C6, alphav, R1);
+ traits.acc(C10, alphav, R2);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C3, alphav, R0);
+ traits.acc(C7, alphav, R1);
+ traits.acc(C11, alphav, R2);
+ r3.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
+ }
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0, C4, C8;
+ traits.initAcc(C0);
+ traits.initAcc(C4);
+ traits.initAcc(C8);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(0);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ LhsPacket A0, A1, A2;
+
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
+ RhsPacket B_0;
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+ } while (false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += int(pk) * int(RhsProgress);
+ blA += int(pk) * 3 * int(Traits::LhsProgress);
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 3*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0, R1, R2;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C8, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+ }
+ }
+ }
+ }
+
+ //---------- Process 2 * LhsProgress rows at once ----------
+ if(mr>=2*Traits::LhsProgress)
+ {
+      const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+ // or because we are testing specific blocking sizes.
+ Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+
+ for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
+ {
+ Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+ {
+
+ // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 2 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3,
+ C4, C5, C6, C7;
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+
+ // NOTE: the begin/end asm comments below work around bug 935!
+ // but they are not enough for gcc>=6 without FMA (bug 1637)
+ #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
+ #else
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
+ #endif
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+ } while (false)
+
+ internal::prefetch(blB+(48+0));
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ internal::prefetch(blB+(48+16));
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*(2*Traits::LhsProgress);
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 2*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+
+ ResPacket R0, R1, R2, R3;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C1, alphav, R2);
+ traits.acc(C5, alphav, R3);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(0 * Traits::ResPacketSize, R2);
+ r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C6, alphav, R1);
+ traits.acc(C3, alphav, R2);
+ traits.acc(C7, alphav, R3);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(0 * Traits::ResPacketSize, R2);
+ r3.storePacket(1 * Traits::ResPacketSize, R3);
+ }
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0, C4;
+ traits.initAcc(C0);
+ traits.initAcc(C4);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(prefetch_res_offset);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ LhsPacket A0, A1;
+
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+ RhsPacket B_0, B1;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += int(pk) * int(RhsProgress);
+ blA += int(pk) * 2 * int(Traits::LhsProgress);
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, B1;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 2*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ }
+ }
+ }
+ }
+ //---------- Process 1 * LhsProgress rows at once ----------
+ if(mr>=1*Traits::LhsProgress)
+ {
+ lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
+ p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process LhsProgressHalf rows at once ----------
+ if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
+ {
+ lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
+ p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process LhsProgressQuarter rows at once ----------
+ if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
+ {
+ lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
+ p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process remaining rows, 1 at once ----------
+ if(peeled_mc_quarter<rows)
+ {
+ // loop on each panel of the rhs
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // loop on each row of the lhs (1*LhsProgress x depth)
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+
+ // If LhsProgress is 8 or 16, it assumes that there is a
+ // half or quarter packet, respectively, of the same size as
+ // nr (which is currently 4) for the return type.
+ const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
+ const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
+ if ((SwappedTraits::LhsProgress % 4) == 0 &&
+ (SwappedTraits::LhsProgress<=16) &&
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
+ (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
+ {
+ SAccPacket C0, C1, C2, C3;
+ straits.initAcc(C0);
+ straits.initAcc(C1);
+ straits.initAcc(C2);
+ straits.initAcc(C3);
+
+ const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
+ const Index endk = (depth/spk)*spk;
+ const Index endk4 = (depth/(spk*4))*(spk*4);
+
+ Index k=0;
+ for(; k<endk4; k+=4*spk)
+ {
+ SLhsPacket A0,A1;
+ SRhsPacket B_0,B_1;
+
+ straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
+ straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
+
+ straits.loadRhsQuad(blA+0*spk, B_0);
+ straits.loadRhsQuad(blA+1*spk, B_1);
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
+ straits.madd(A1,B_1,C1,B_1, fix<0>);
+
+ straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
+ straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
+ straits.loadRhsQuad(blA+2*spk, B_0);
+ straits.loadRhsQuad(blA+3*spk, B_1);
+ straits.madd(A0,B_0,C2,B_0, fix<0>);
+ straits.madd(A1,B_1,C3,B_1, fix<0>);
+
+ blB += 4*SwappedTraits::LhsProgress;
+ blA += 4*spk;
+ }
+ C0 = padd(padd(C0,C1),padd(C2,C3));
+ for(; k<endk; k+=spk)
+ {
+ SLhsPacket A0;
+ SRhsPacket B_0;
+
+ straits.loadLhsUnaligned(blB, A0);
+ straits.loadRhsQuad(blA, B_0);
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
+
+ blB += SwappedTraits::LhsProgress;
+ blA += spk;
+ }
+ if(SwappedTraits::LhsProgress==8)
+ {
+ // Special case where we have to first reduce the accumulation register C0
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
+
+ SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+ SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+ if(depth-endk>0)
+ {
+ // We have to handle the last row of the rhs which corresponds to a half-packet
+ SLhsPacketHalf a0;
+ SRhsPacketHalf b0;
+ straits.loadLhsUnaligned(blB, a0);
+ straits.loadRhs(blA, b0);
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
+ straits.madd(a0,b0,c0,b0, fix<0>);
+ straits.acc(c0, alphav, R);
+ }
+ else
+ {
+ straits.acc(predux_half_dowto4(C0), alphav, R);
+ }
+ res.scatterPacket(i, j2, R);
+ }
+ else if (SwappedTraits::LhsProgress==16)
+ {
+ // Special case where we have to first reduce the
+ // accumulation register C0. We specialize the block in
+ // template form, so that LhsProgress < 16 paths don't
+ // fail to compile
+ last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
+ p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
+ }
+ else
+ {
+ SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+ SResPacket alphav = pset1<SResPacket>(alpha);
+ straits.acc(C0, alphav, R);
+ res.scatterPacket(i, j2, R);
+ }
+ }
+ else // scalar path
+ {
+ // get a 1 x 4 res block as registers
+ ResScalar C0(0), C1(0), C2(0), C3(0);
+
+ for(Index k=0; k<depth; k++)
+ {
+ LhsScalar A0;
+ RhsScalar B_0, B_1;
+
+ A0 = blA[k];
+
+ B_0 = blB[0];
+ B_1 = blB[1];
+ C0 = cj.pmadd(A0,B_0,C0);
+ C1 = cj.pmadd(A0,B_1,C1);
+
+ B_0 = blB[2];
+ B_1 = blB[3];
+ C2 = cj.pmadd(A0,B_0,C2);
+ C3 = cj.pmadd(A0,B_1,C3);
+
+ blB += 4;
+ }
+ res(i, j2 + 0) += alpha * C0;
+ res(i, j2 + 1) += alpha * C1;
+ res(i, j2 + 2) += alpha * C2;
+ res(i, j2 + 3) += alpha * C3;
+ }
+ }
+ }
+ // remaining columns
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // loop on each row of the lhs (1*LhsProgress x depth)
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
+ // gets a 1 x 1 res block as registers
+ ResScalar C0(0);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ for(Index k=0; k<depth; k++)
+ {
+ LhsScalar A0 = blA[k];
+ RhsScalar B_0 = blB[k];
+ C0 = cj.pmadd(A0, B_0, C0);
+ }
+ res(i, j2) += alpha * C0;
+ }
+ }
+ }
+ }
+
+
+// pack a block of the lhs
+// The traversal is as follows (mr==4):
+// 0 4 8 12 ...
+// 1 5 9 13 ...
+// 2 6 10 14 ...
+// 3 7 11 15 ...
+//
+// 16 20 24 28 ...
+// 17 21 25 29 ...
+// 18 22 26 30 ...
+// 19 23 27 31 ...
+//
+// 32 33 34 35 ...
+//  36 37 38 39 ...
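+// (Read the numbers as each coefficient's destination offset in blockA: within a band of
+// mr rows, the mr entries of one depth-column are stored contiguously before moving to the
+// next k; the leftover rows at the bottom are packed one row at a time, contiguously along
+// the depth.)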
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+ enum { PacketSize = unpacket_traits<Packet>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index count = 0;
+
+ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+ const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
+ const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
+ : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
+
+ Index i=0;
+
+ // Pack 3 packets
+ if(Pack1>=3*PacketSize)
+ {
+ for(; i<peeled_mc3; i+=3*PacketSize)
+ {
+ if(PanelMode) count += (3*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A, B, C;
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+ C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
+ pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+ pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
+ pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
+ }
+ if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
+ }
+ }
+ // Pack 2 packets
+ if(Pack1>=2*PacketSize)
+ {
+ for(; i<peeled_mc2; i+=2*PacketSize)
+ {
+ if(PanelMode) count += (2*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A, B;
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+ pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+ pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
+ }
+ if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
+ }
+ }
+ // Pack 1 packets
+ if(Pack1>=1*PacketSize)
+ {
+ for(; i<peeled_mc1; i+=1*PacketSize)
+ {
+ if(PanelMode) count += (1*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A;
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+ pstore(blockA+count, cj.pconj(A));
+ count+=PacketSize;
+ }
+ if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
+ }
+ }
+ // Pack half packets
+ if(HasHalf && Pack1>=HalfPacketSize)
+ {
+ for(; i<peeled_mc_half; i+=HalfPacketSize)
+ {
+ if(PanelMode) count += (HalfPacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ HalfPacket A;
+ A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
+ pstoreu(blockA+count, cj.pconj(A));
+ count+=HalfPacketSize;
+ }
+ if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
+ }
+ }
+ // Pack quarter packets
+ if(HasQuarter && Pack1>=QuarterPacketSize)
+ {
+ for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
+ {
+ if(PanelMode) count += (QuarterPacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ QuarterPacket A;
+ A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
+ pstoreu(blockA+count, cj.pconj(A));
+ count+=QuarterPacketSize;
+ }
+ if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
+ }
+ }
+ // Pack2 may be *smaller* than PacketSize—that happens for
+ // products like real * complex, where we have to go half the
+ // progress on the lhs in order to duplicate those operands to
+ // address both real & imaginary parts on the rhs. This portion will
+ // pack those half ones until they match the number expected on the
+ // last peeling loop at this point (for the rhs).
+ if(Pack2<PacketSize && Pack2>1)
+ {
+ for(; i<peeled_mc0; i+=last_lhs_progress)
+ {
+ if(PanelMode) count += last_lhs_progress * offset;
+
+ for(Index k=0; k<depth; k++)
+ for(Index w=0; w<last_lhs_progress; w++)
+ blockA[count++] = cj(lhs(i+w, k));
+
+ if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
+ }
+ }
+ // Pack scalars
+ for(; i<rows; i++)
+ {
+ if(PanelMode) count += offset;
+ for(Index k=0; k<depth; k++)
+ blockA[count++] = cj(lhs(i, k));
+ if(PanelMode) count += (stride-offset-depth);
+ }
+}
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+ enum { PacketSize = unpacket_traits<Packet>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index count = 0;
+ bool gone_half = false, gone_quarter = false, gone_last = false;
+
+ Index i = 0;
+ int pack = Pack1;
+ int psize = PacketSize;
+ while(pack>0)
+ {
+ Index remaining_rows = rows-i;
+ Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
+ Index starting_pos = i;
+ for(; i<peeled_mc; i+=pack)
+ {
+ if(PanelMode) count += pack * offset;
+
+ Index k=0;
+ if(pack>=psize && psize >= QuarterPacketSize)
+ {
+ const Index peeled_k = (depth/psize)*psize;
+ for(; k<peeled_k; k+=psize)
+ {
+ for (Index m = 0; m < pack; m += psize)
+ {
+ if (psize == PacketSize) {
+ PacketBlock<Packet> kernel;
+ for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
+ ptranspose(kernel);
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+ } else if (HasHalf && psize == HalfPacketSize) {
+ gone_half = true;
+ PacketBlock<HalfPacket> kernel_half;
+ for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
+ ptranspose(kernel_half);
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
+ } else if (HasQuarter && psize == QuarterPacketSize) {
+ gone_quarter = true;
+ PacketBlock<QuarterPacket> kernel_quarter;
+ for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
+ ptranspose(kernel_quarter);
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
+ }
+ }
+ count += psize*pack;
+ }
+ }
+
+ for(; k<depth; k++)
+ {
+ Index w=0;
+ for(; w<pack-3; w+=4)
+ {
+ Scalar a(cj(lhs(i+w+0, k))),
+ b(cj(lhs(i+w+1, k))),
+ c(cj(lhs(i+w+2, k))),
+ d(cj(lhs(i+w+3, k)));
+ blockA[count++] = a;
+ blockA[count++] = b;
+ blockA[count++] = c;
+ blockA[count++] = d;
+ }
+ if(pack%4)
+ for(;w<pack;++w)
+ blockA[count++] = cj(lhs(i+w, k));
+ }
+
+ if(PanelMode) count += pack * (stride-offset-depth);
+ }
+
+ pack -= psize;
+ Index left = rows - i;
+ if (pack <= 0) {
+ if (!gone_last &&
+ (starting_pos == i || left >= psize/2 || left >= psize/4) &&
+ ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
+ (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
+ psize /= 2;
+ pack = psize;
+ continue;
+ }
+ // Pack2 may be *smaller* than PacketSize—that happens for
+ // products like real * complex, where we have to go half the
+ // progress on the lhs in order to duplicate those operands to
+ // address both real & imaginary parts on the rhs. This portion will
+ // pack those half ones until they match the number expected on the
+ // last peeling loop at this point (for the rhs).
+ if (Pack2 < PacketSize && !gone_last) {
+ gone_last = true;
+ psize = pack = left & ~1;
+ }
+ }
+ }
+
+ for(; i<rows; i++)
+ {
+ if(PanelMode) count += offset;
+ for(Index k=0; k<depth; k++)
+ blockA[count++] = cj(lhs(i, k));
+ if(PanelMode) count += (stride-offset-depth);
+ }
+}
+
+// copy a complete panel of the rhs
+// this version is optimized for column major matrices
+// The traversal order is as follows (nr==4):
+// 0 1 2 3 12 13 14 15 24 27
+// 4 5 6 7 16 17 18 19 25 28
+// 8 9 10 11 20 21 22 23 26 29
+// . . . . . . . . . .
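+// (Read the numbers as each coefficient's destination offset in blockB: full groups of
+// nr columns are interleaved one depth-row k at a time, while the leftover columns are
+// copied one at a time, contiguously along the depth.)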
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ enum { PacketSize = packet_traits<Scalar>::size };
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+ Index count = 0;
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
+// if(nr>=8)
+// {
+// for(Index j2=0; j2<packet_cols8; j2+=8)
+// {
+// // skip what we have before
+// if(PanelMode) count += 8 * offset;
+// const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+// const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+// const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+// const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+// const Scalar* b4 = &rhs[(j2+4)*rhsStride];
+// const Scalar* b5 = &rhs[(j2+5)*rhsStride];
+// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
+// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
+// Index k=0;
+// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+// {
+// for(; k<peeled_k; k+=PacketSize) {
+// PacketBlock<Packet> kernel;
+// for (int p = 0; p < PacketSize; ++p) {
+// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+// }
+// ptranspose(kernel);
+// for (int p = 0; p < PacketSize; ++p) {
+// pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+// count+=PacketSize;
+// }
+// }
+// }
+// for(; k<depth; k++)
+// {
+// blockB[count+0] = cj(b0[k]);
+// blockB[count+1] = cj(b1[k]);
+// blockB[count+2] = cj(b2[k]);
+// blockB[count+3] = cj(b3[k]);
+// blockB[count+4] = cj(b4[k]);
+// blockB[count+5] = cj(b5[k]);
+// blockB[count+6] = cj(b6[k]);
+// blockB[count+7] = cj(b7[k]);
+// count += 8;
+// }
+// // skip what we have after
+// if(PanelMode) count += 8 * (stride-offset-depth);
+// }
+// }
+
+ if(nr>=4)
+ {
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+ {
+ // skip what we have before
+ if(PanelMode) count += 4 * offset;
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+ Index k=0;
+ if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
+ {
+ for(; k<peeled_k; k+=PacketSize) {
+ PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
+ kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
+ kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
+ kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
+ kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
+ ptranspose(kernel);
+ pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
+ pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
+ pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
+ pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
+ count+=4*PacketSize;
+ }
+ }
+ for(; k<depth; k++)
+ {
+ blockB[count+0] = cj(dm0(k));
+ blockB[count+1] = cj(dm1(k));
+ blockB[count+2] = cj(dm2(k));
+ blockB[count+3] = cj(dm3(k));
+ count += 4;
+ }
+ // skip what we have after
+ if(PanelMode) count += 4 * (stride-offset-depth);
+ }
+ }
+
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ if(PanelMode) count += offset;
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
+ for(Index k=0; k<depth; k++)
+ {
+ blockB[count] = cj(dm0(k));
+ count += 1;
+ }
+ if(PanelMode) count += (stride-offset-depth);
+ }
+}
+
+// this version is optimized for row major matrices
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ enum { PacketSize = packet_traits<Scalar>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
+ {
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+ const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+ Index count = 0;
+
+ // if(nr>=8)
+ // {
+ // for(Index j2=0; j2<packet_cols8; j2+=8)
+ // {
+ // // skip what we have before
+ // if(PanelMode) count += 8 * offset;
+ // for(Index k=0; k<depth; k++)
+ // {
+ // if (PacketSize==8) {
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+ // pstoreu(blockB+count, cj.pconj(A));
+ // } else if (PacketSize==4) {
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+ // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+ // pstoreu(blockB+count, cj.pconj(A));
+ // pstoreu(blockB+count+PacketSize, cj.pconj(B));
+ // } else {
+ // const Scalar* b0 = &rhs[k*rhsStride + j2];
+ // blockB[count+0] = cj(b0[0]);
+ // blockB[count+1] = cj(b0[1]);
+ // blockB[count+2] = cj(b0[2]);
+ // blockB[count+3] = cj(b0[3]);
+ // blockB[count+4] = cj(b0[4]);
+ // blockB[count+5] = cj(b0[5]);
+ // blockB[count+6] = cj(b0[6]);
+ // blockB[count+7] = cj(b0[7]);
+ // }
+ // count += 8;
+ // }
+ // // skip what we have after
+ // if(PanelMode) count += 8 * (stride-offset-depth);
+ // }
+ // }
+ if(nr>=4)
+ {
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+ {
+ // skip what we have before
+ if(PanelMode) count += 4 * offset;
+ for(Index k=0; k<depth; k++)
+ {
+ if (PacketSize==4) {
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
+ pstoreu(blockB+count, cj.pconj(A));
+ count += PacketSize;
+ } else if (HasHalf && HalfPacketSize==4) {
+ HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+ pstoreu(blockB+count, cj.pconj(A));
+ count += HalfPacketSize;
+ } else if (HasQuarter && QuarterPacketSize==4) {
+ QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+ pstoreu(blockB+count, cj.pconj(A));
+ count += QuarterPacketSize;
+ } else {
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+ blockB[count+0] = cj(dm0(0));
+ blockB[count+1] = cj(dm0(1));
+ blockB[count+2] = cj(dm0(2));
+ blockB[count+3] = cj(dm0(3));
+ count += 4;
+ }
+ }
+ // skip what we have after
+ if(PanelMode) count += 4 * (stride-offset-depth);
+ }
+ }
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ if(PanelMode) count += offset;
+ for(Index k=0; k<depth; k++)
+ {
+ blockB[count] = cj(rhs(k, j2));
+ count += 1;
+ }
+ if(PanelMode) count += stride-offset-depth;
+ }
+ }
+};
+
+} // end namespace internal
+
+/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l1CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l1;
+}
+
+/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l2CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l2;
+}
+
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l3CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l3;
+}
+
+/** Set the cpu L1, L2, and L3 cache sizes (in bytes).
+ * These values are used to adjust the size of the blocks
+ * for the algorithms working per block.
+ *
+ * \sa computeProductBlockingSizes */
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
+{
+ internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
+}
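+// Usage sketch (illustrative only): the accessors above let client code inspect or override
+// the cache sizes used by the blocking heuristics, e.g.:
+//   std::ptrdiff_t l1 = Eigen::l1CacheSize();                   // current L1 estimate, in bytes
+//   Eigen::setCpuCacheSizes(32*1024, 1024*1024, 8*1024*1024);   // override L1/L2/L3, in bytes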
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
new file mode 100644
index 000000000..caa65fccc
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -0,0 +1,517 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
+
+/* Specialization for a row-major destination matrix => simple transposition of the product */
+template<
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride>
+{
+ typedef gebp_traits<RhsScalar,LhsScalar> Traits;
+
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols, Index depth,
+ const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsStride,
+ ResScalar* res, Index resIncr, Index resStride,
+ ResScalar alpha,
+ level3_blocking<RhsScalar,LhsScalar>& blocking,
+ GemmParallelInfo<Index>* info = 0)
+ {
+ // transpose the product such that the result is column major
+ general_matrix_matrix_product<Index,
+ RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+ LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+ ColMajor,ResInnerStride>
+ ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info);
+ }
+};
+
+/* Specialization for a col-major destination matrix
+ * => Blocking algorithm following Goto's paper */
+template<
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride>
+{
+
+typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+static void run(Index rows, Index cols, Index depth,
+ const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride,
+ ResScalar* _res, Index resIncr, Index resStride,
+ ResScalar alpha,
+ level3_blocking<LhsScalar,RhsScalar>& blocking,
+ GemmParallelInfo<Index>* info = 0)
+{
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
+ LhsMapper lhs(_lhs, lhsStride);
+ RhsMapper rhs(_rhs, rhsStride);
+ ResMapper res(_res, resStride, resIncr);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
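+  // In short, the sequential path below iterates as follows:
+  //   for each mc-row panel of the lhs (i2)
+  //     for each kc-deep slice (k2): pack lhs(i2..i2+mc, k2..k2+kc) into blockA
+  //       for each nc-column panel of the rhs (j2): pack rhs(k2..k2+kc, j2..j2+nc) into blockB
+  //         and run the gebp kernel: res(i2.., j2..) += alpha * blockA * blockB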
+
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+
+#ifdef EIGEN_HAS_OPENMP
+ if(info)
+ {
+ // this is the parallel version!
+ int tid = omp_get_thread_num();
+ int threads = omp_get_num_threads();
+
+ LhsScalar* blockA = blocking.blockA();
+ eigen_internal_assert(blockA!=0);
+
+ std::size_t sizeB = kc*nc;
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
+
+ // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
+ for(Index k=0; k<depth; k+=kc)
+ {
+ const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
+
+        // In order to reduce the chance that a thread has to wait for the others,
+        // let's start by packing B'.
+ pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
+
+ // Pack A_k to A' in a parallel fashion:
+ // each thread packs the sub block A_k,i to A'_i where i is the thread id.
+
+ // However, before copying to A'_i, we have to make sure that no other thread is still using it,
+ // i.e., we test that info[tid].users equals 0.
+ // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
+ while(info[tid].users!=0) {}
+ info[tid].users = threads;
+
+ pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
+
+ // Notify the other threads that the part A'_i is ready to go.
+ info[tid].sync = k;
+
+ // Computes C_i += A' * B' per A'_i
+ for(int shift=0; shift<threads; ++shift)
+ {
+ int i = (tid+shift)%threads;
+
+          // At this point we have to make sure that A'_i has been updated by thread i;
+          // the spin-wait below mimics a volatile access.
+          // However, there is no need to wait for the B' part, which has been updated by the current thread.
+ if (shift>0) {
+ while(info[i].sync!=k) {
+ }
+ }
+
+ gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+ }
+
+ // Then keep going as usual with the remaining B'
+ for(Index j=nc; j<cols; j+=nc)
+ {
+ const Index actual_nc = (std::min)(j+nc,cols)-j;
+
+ // pack B_k,j to B'
+ pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
+
+ // C_j += A' * B'
+ gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
+ }
+
+ // Release all the sub blocks A'_i of A' for the current thread,
+ // i.e., we simply decrement the number of users by 1
+ for(Index i=0; i<threads; ++i)
+#if !EIGEN_HAS_CXX11_ATOMIC
+ #pragma omp atomic
+#endif
+ info[i].users -= 1;
+ }
+ }
+ else
+#endif // EIGEN_HAS_OPENMP
+ {
+ EIGEN_UNUSED_VARIABLE(info);
+
+ // this is the sequential version!
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*nc;
+
+ ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+ const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
+
+ // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+
+ for(Index k2=0; k2<depth; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+ // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+ // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+ // Note that this panel will be read as many times as the number of blocks in the rhs's
+ // horizontal panel which is, in practice, a very low number.
+ pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+ // For each kc x nc block of the rhs's horizontal panel...
+ for(Index j2=0; j2<cols; j2+=nc)
+ {
+ const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+ // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panels of the lhs (e.g., rows/12 times).
+ if((!pack_rhs_once) || i2==0)
+ pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+
+ // Everything is packed, we can now call the panel * block kernel:
+ gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+ }
+ }
+ }
+ }
+}
+
+};
+
+/*********************************************************************************
+* Specialization of generic_product_impl for "large" GEMM, i.e.,
+* implementation of the high level wrapper to general_matrix_matrix_product
+**********************************************************************************/
+
+template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
+struct gemm_functor
+{
+ gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
+ : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
+ {}
+
+ void initParallelSession(Index num_threads) const
+ {
+ m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
+ m_blocking.allocateA();
+ }
+
+ void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
+ {
+ if(cols==-1)
+ cols = m_rhs.cols();
+
+ Gemm::run(rows, cols, m_lhs.cols(),
+ &m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+ &m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+ (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(),
+ m_actualAlpha, m_blocking, info);
+ }
+
+ typedef typename Gemm::Traits Traits;
+
+ protected:
+ const Lhs& m_lhs;
+ const Rhs& m_rhs;
+ Dest& m_dest;
+ Scalar m_actualAlpha;
+ BlockingType& m_blocking;
+};
+
+template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1,
+bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space;
+
+template<typename _LhsScalar, typename _RhsScalar>
+class level3_blocking
+{
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+
+ protected:
+ LhsScalar* m_blockA;
+ RhsScalar* m_blockB;
+
+ Index m_mc;
+ Index m_nc;
+ Index m_kc;
+
+ public:
+
+ level3_blocking()
+ : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
+ {}
+
+ inline Index mc() const { return m_mc; }
+ inline Index nc() const { return m_nc; }
+ inline Index kc() const { return m_kc; }
+
+ inline LhsScalar* blockA() { return m_blockA; }
+ inline RhsScalar* blockB() { return m_blockB; }
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
+ : public level3_blocking<
+ typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+ typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+ enum {
+ Transpose = StorageOrder==RowMajor,
+ ActualRows = Transpose ? MaxCols : MaxRows,
+ ActualCols = Transpose ? MaxRows : MaxCols
+ };
+ typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+ typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ enum {
+ SizeA = ActualRows * MaxDepth,
+ SizeB = ActualCols * MaxDepth
+ };
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+ EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+ EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+ EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+ EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+#endif
+
+ public:
+
+ gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
+ {
+ this->m_mc = ActualRows;
+ this->m_nc = ActualCols;
+ this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+ this->m_blockA = m_staticA;
+ this->m_blockB = m_staticB;
+#else
+ this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+ this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+#endif
+ }
+
+ void initParallel(Index, Index, Index, Index)
+ {}
+
+ inline void allocateA() {}
+ inline void allocateB() {}
+ inline void allocateAll() {}
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false>
+ : public level3_blocking<
+ typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+ typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+ enum {
+ Transpose = StorageOrder==RowMajor
+ };
+ typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+ typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ Index m_sizeA;
+ Index m_sizeB;
+
+ public:
+
+ gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
+ {
+ this->m_mc = Transpose ? cols : rows;
+ this->m_nc = Transpose ? rows : cols;
+ this->m_kc = depth;
+
+ if(l3_blocking)
+ {
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+ }
+ else // no l3 blocking
+ {
+ Index n = this->m_nc;
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads);
+ }
+
+ m_sizeA = this->m_mc * this->m_kc;
+ m_sizeB = this->m_kc * this->m_nc;
+ }
+
+ void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+ {
+ this->m_mc = Transpose ? cols : rows;
+ this->m_nc = Transpose ? rows : cols;
+ this->m_kc = depth;
+
+ eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
+ Index m = this->m_mc;
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+ m_sizeA = this->m_mc * this->m_kc;
+ m_sizeB = this->m_kc * this->m_nc;
+ }
+
+ void allocateA()
+ {
+ if(this->m_blockA==0)
+ this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
+ }
+
+ void allocateB()
+ {
+ if(this->m_blockB==0)
+ this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
+ }
+
+ void allocateAll()
+ {
+ allocateA();
+ allocateB();
+ }
+
+ ~gemm_blocking_space()
+ {
+ aligned_delete(this->m_blockA, m_sizeA);
+ aligned_delete(this->m_blockB, m_sizeB);
+ }
+};
+
+} // end namespace internal
+
+namespace internal {
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+ : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
+{
+ typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+ typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+ typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+ enum {
+ MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+ };
+
+ typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
+
+ template<typename Dst>
+ static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ {
+ // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+ // to determine the following heuristic.
+ // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+ // unless it has been specialized by the user or for a given architecture.
+    // Note that the condition rhs.rows()>0 was required because the lazy product did not handle empty inputs well;
+    // it is unclear whether this is still needed.
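+    // For instance (illustrative numbers only): with the default threshold of 20, a product of two
+    // 4x4 matrices gives rhs.rows()+dst.rows()+dst.cols() == 4+4+4 == 12 < 20, so the lazy
+    // coefficient-based path is taken, whereas 16x16 operands (16+16+16 == 48) go through the
+    // blocked GEMM path below.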
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
+ else
+ {
+ dst.setZero();
+ scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+ }
+ }
+
+ template<typename Dst>
+ static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ {
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
+ else
+ scaleAndAddTo(dst,lhs, rhs, Scalar(1));
+ }
+
+ template<typename Dst>
+ static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ {
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
+ else
+ scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+ }
+
+ template<typename Dest>
+ static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
+ {
+ eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+ if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+ return;
+
+ if (dst.cols() == 1)
+ {
+ // Fallback to GEMV if either the lhs or rhs is a runtime vector
+ typename Dest::ColXpr dst_vec(dst.col(0));
+ return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct>
+ ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
+ }
+ else if (dst.rows() == 1)
+ {
+ // Fallback to GEMV if either the lhs or rhs is a runtime vector
+ typename Dest::RowXpr dst_vec(dst.row(0));
+ return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct>
+ ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
+ }
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+ Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);
+
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+ Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+ typedef internal::gemm_functor<
+ Scalar, Index,
+ internal::general_matrix_matrix_product<
+ Index,
+ LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+ RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+ (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ Dest::InnerStrideAtCompileTime>,
+ ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
+
+ BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+ internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
+ (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
new file mode 100644
index 000000000..6ba0d9bdb
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -0,0 +1,317 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update;
+
+namespace internal {
+
+/**********************************************************************
+* This file implements a general A * B product while
+* evaluating only one triangular part of the product.
+* This is a more general version of the self-adjoint product (C += A A^T)
+* provided by the Level 3 BLAS SYRK routine.
+**********************************************************************/
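+
+// Illustrative (hypothetical) user-level code that is expected to reach this kernel through
+// TriangularViewImpl::_assignProduct (defined at the end of this file):
+//   MatrixXd C(n,n), A(n,k), B(k,n);
+//   C.triangularView<Lower>() += s * (A * B);  // only the requested triangle of A*B is evaluated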
+
+// forward declarations (defined at the end of this file)
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
+struct tribb_kernel;
+
+/* Optimized matrix-matrix product evaluating only one triangular half */
+template <typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized>
+struct general_matrix_matrix_triangular_product;
+
+// as usual if the result is row major => we transpose the product
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version>
+{
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride,
+ const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
+ {
+ general_matrix_matrix_triangular_product<Index,
+ RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+ LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+ ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
+ ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
+ }
+};
+
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
+{
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride,
+ ResScalar* _res, Index resIncr, Index resStride,
+ const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
+ {
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride, resIncr);
+
+ Index kc = blocking.kc();
+ Index mc = (std::min)(size,blocking.mc());
+
+ // !!! mc must be a multiple of nr:
+ if(mc > Traits::nr)
+ mc = (mc/Traits::nr)*Traits::nr;
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*size;
+
+ ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+ tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
+
+ for(Index k2=0; k2<depth; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+ // note that the actual rhs is the transpose/adjoint of mat
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
+
+ for(Index i2=0; i2<size; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,size)-i2;
+
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+      // the selected actual_mc x size panel of res is split into three different parts:
+ // 1 - before the diagonal => processed with gebp or skipped
+ // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
+ // 3 - after the diagonal => processed with gebp or skipped
+ if (UpLo==Lower)
+ gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
+ (std::min)(size,i2), alpha, -1, -1, 0, 0);
+
+ sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+
+ if (UpLo==Upper)
+ {
+ Index j2 = i2+actual_mc;
+ gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
+ actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+};
+
+// Optimized packed Block * packed Block product kernel evaluating only one given triangular part
+// This kernel is built on top of the gebp kernel:
+// - the current destination block is processed per panel of actual_mc x BlockSize
+// where BlockSize is set to the minimal value allowing gebp to be as fast as possible
+//  - then each panel is split into three parts along the diagonal:
+//    the sub-blocks above and below the diagonal are processed with the regular gebp kernel,
+//    while the triangular block overlapping the diagonal is evaluated into a
+//    small temporary buffer, which is then accumulated into the result using a
+//    triangular traversal.
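+//
+// For example (the values are architecture dependent and only illustrative), with mr == 12 and
+// nr == 4 the BlockSize below is lcm(12,4) == 12, so the destination is swept in panels of
+// 12 columns.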
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
+struct tribb_kernel
+{
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+
+ enum {
+ BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
+ };
+ void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+ {
+ typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+ typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
+ ResMapper res(_res, resStride, resIncr);
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
+ gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
+
+ Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
+
+ // let's process the block per panel of actual_mc x BlockSize,
+ // again, each is split into three parts, etc.
+ for (Index j=0; j<size; j+=BlockSize)
+ {
+ Index actualBlockSize = std::min<Index>(BlockSize,size - j);
+ const RhsScalar* actual_b = blockB+j*depth;
+
+ if(UpLo==Upper)
+ gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+ -1, -1, 0, 0);
+
+ // selfadjoint micro block
+ {
+ Index i = j;
+ buffer.setZero();
+ // 1 - apply the kernel on the temporary buffer
+ gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+ -1, -1, 0, 0);
+
+ // 2 - triangular accumulation
+ for(Index j1=0; j1<actualBlockSize; ++j1)
+ {
+ typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
+ for(Index i1=UpLo==Lower ? j1 : 0;
+ UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
+ r(i1) += buffer(i1,j1);
+ }
+ }
+
+ if(UpLo==Lower)
+ {
+ Index i = j+actualBlockSize;
+ gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
+ depth, actualBlockSize, alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+};
+
+} // end namespace internal
+
+// high level API
+
+template<typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct>
+struct general_product_to_triangular_selector;
+
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
+{
+ static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+
+ typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+ typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+ typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+ typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+ typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+ typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+ if(!beta)
+ mat.template triangularView<UpLo>().setZero();
+
+ enum {
+ StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
+ UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
+ };
+
+ internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
+ (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
+ if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
+
+ internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
+ (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
+ if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+
+
+ selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+ LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
+ ::run(actualLhs.size(), mat.data(), mat.outerStride(), actualLhsPtr, actualRhsPtr, actualAlpha);
+ }
+};
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
+{
+ static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
+ {
+ typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+ typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+ typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+ typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+ typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+ typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+ if(!beta)
+ mat.template triangularView<UpLo>().setZero();
+
+ enum {
+ IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+ LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
+ RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
+ SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
+ };
+
+ Index size = mat.cols();
+ if(SkipDiag)
+ size--;
+ Index depth = actualLhs.cols();
+
+ typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
+ MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType;
+
+ BlockingType blocking(size, size, depth, 1, false);
+
+ internal::general_matrix_matrix_triangular_product<Index,
+ typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+ typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+ IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
+ ::run(size, depth,
+ &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
+ &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
+ mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
+ mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
+ }
+};
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename ProductType>
+EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+{
+ EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+ eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
+
+ general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
+
+ return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
new file mode 100644
index 000000000..9a650ec23
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -0,0 +1,145 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ * Level 3 BLAS SYRK/HERK implementation.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+struct general_matrix_matrix_rankupdate :
+ general_matrix_matrix_triangular_product<
+ Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
+struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
+ Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
+ const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
+ { \
+ if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
+ general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
+ } else { \
+ general_matrix_matrix_triangular_product<Index, \
+ Scalar, LhsStorageOrder, ConjugateLhs, \
+ Scalar, RhsStorageOrder, ConjugateRhs, \
+ ColMajor, 1, UpLo, BuiltIn> \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
+ } \
+ } \
+};
+
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)
+
+// SYRK for float/double
+#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+ enum { \
+ IsLower = (UpLo&Lower) == Lower, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
+ }; \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+ { \
+ /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
+\
+ BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
+ EIGTYPE beta(1); \
+ BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
+ } \
+};
+
+// HERK for complex data
+#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+ enum { \
+ IsLower = (UpLo&Lower) == Lower, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
+ }; \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+ { \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
+\
+ BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \
+ RTYPE alpha_, beta_; \
+ const EIGTYPE* a_ptr; \
+\
+ alpha_ = alpha.real(); \
+ beta_ = 1.0; \
+/* Copy with conjugation in some cases*/ \
+ MatrixType a; \
+ if (conjA) { \
+ Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
+ a = mapA.conjugate(); \
+ lda = a.outerStride(); \
+ a_ptr = a.data(); \
+ } else a_ptr=lhs; \
+ BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
+#else
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
+EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
+#endif
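+
+// Illustrative mapping (assuming the Fortran BLAS bindings selected above): a column-major,
+// lower-triangular rank update C = alpha*A*A^T + C on doubles reaches dsyrk_ with uplo=='L',
+// trans=='N' and beta==1; n, k and the leading dimensions are converted to BlasIndex.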
+
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
+// EIGEN_BLAS_RANKUPDATE_C(scomplex, float, float, cherk_)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
new file mode 100644
index 000000000..71abf4013
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -0,0 +1,124 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ * General matrix-matrix product functionality based on ?GEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-matrix multiplication using the BLAS
+* gemm function via partial specialization of the
+* general_matrix_matrix_product::run(..) method for the float, double,
+* std::complex<float> and std::complex<double> types.
+**********************************************************************/
+
+// gemm specialization
+
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
+template< \
+ typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
+{ \
+typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
+\
+static void run(Index rows, Index cols, Index depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resIncr, Index resStride, \
+ EIGTYPE alpha, \
+ level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
+ GemmParallelInfo<Index>* /*info = 0*/) \
+{ \
+ using std::conj; \
+\
+ EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+ eigen_assert(resIncr == 1); \
+ char transa, transb; \
+ BlasIndex m, n, k, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ EIGTYPE beta(1); \
+ MatrixX##EIGPREFIX a_tmp, b_tmp; \
+\
+/* Set transpose options */ \
+ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+ transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set m, n, k */ \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
+ k = convert_index<BlasIndex>(depth); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ ldb = convert_index<BlasIndex>(rhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
+ a_tmp = lhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+ } else a = _lhs; \
+\
+ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.conjugate(); \
+ b = b_tmp.data(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+ } else b = _rhs; \
+\
+ BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+}};
+
+#ifdef EIGEN_USE_MKL
+GEMM_SPECIALIZATION(double, d, double, dgemm)
+GEMM_SPECIALIZATION(float, f, float, sgemm)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
+#else
+GEMM_SPECIALIZATION(double, d, double, dgemm_)
+GEMM_SPECIALIZATION(float, f, float, sgemm_)
+GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
+#endif
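+
+// Illustrative effect of the macro above: GEMM_SPECIALIZATION(double, d, double, dgemm_) makes a
+// column-major double product call dgemm_ directly; row-major and conjugated operands are handled
+// through the 'T'/'C' transpose flags or, for conjugated column-major operands, through a
+// conjugated temporary copy, rather than by reordering the data in place.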
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
new file mode 100644
index 000000000..dfb6aebce
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -0,0 +1,518 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+enum GEMVPacketSizeType {
+ GEMVPacketFull = 0,
+ GEMVPacketHalf,
+ GEMVPacketQuarter
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct gemv_packet_cond { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
+class gemv_traits
+{
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
+ typedef typename gemv_packet_cond<packet_size, \
+ typename packet_traits<name ## Scalar>::type, \
+ typename packet_traits<name ## Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+ prefix ## name ## Packet
+
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+#undef PACKET_DECL_COND_PREFIX
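+  // The macro above declares _LhsPacket, _RhsPacket and _ResPacket as the full-, half- or
+  // quarter-sized SIMD packet type of the corresponding scalar, as selected by _PacketSize.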
+
+public:
+ enum {
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
+ unpacket_traits<_RhsPacket>::vectorizable &&
+ int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
+ };
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+};
+
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes the matrix per vertical panels,
+ * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
+ *
+ * Mixing type logic: C += alpha * A * B
+ * | A | B |alpha| comments
+ * |real |cplx |cplx | no vectorization
+ * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
+ * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * The same reasoning applies to the transposed case.
+ */
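+// Illustrative instance of the "optimal" row of the table above: a MatrixXcf times a VectorXf with
+// a real (float) alpha can be vectorized with real-cplx packet multiplies, whereas a real matrix
+// times a complex vector falls back to the scalar path.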
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+ typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
+
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
+
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ RhsScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+ Index rows, Index cols,
+ const LhsMapper& alhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ RhsScalar alpha)
+{
+ EIGEN_UNUSED_VARIABLE(resIncr);
+ eigen_internal_assert(resIncr==1);
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
+  // This helps GCC generate proper code.
+ LhsMapper lhs(alhs);
+
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+ conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+ conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
+
+ const Index lhsStride = lhs.stride();
+ // TODO: for padded aligned inputs, we could enable aligned reads
+ enum { LhsAlignment = Unaligned,
+ ResPacketSize = Traits::ResPacketSize,
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+ LhsPacketSize = Traits::LhsPacketSize,
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+ };
+
+ const Index n8 = rows-8*ResPacketSize+1;
+ const Index n4 = rows-4*ResPacketSize+1;
+ const Index n3 = rows-3*ResPacketSize+1;
+ const Index n2 = rows-2*ResPacketSize+1;
+ const Index n1 = rows-1*ResPacketSize+1;
+ const Index n_half = rows-1*ResPacketSizeHalf+1;
+ const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
+
+ // TODO: improve the following heuristic:
+ const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
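+  // Illustrative numbers: for a 1000x1000 column-major double matrix, cols >= 128 and
+  // lhsStride*sizeof(double) == 8000 < 32000, so columns are processed 16 at a time; larger
+  // strides (lhsStride*sizeof(LhsScalar) >= 32000) fall back to blocks of 4 columns.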
+ ResPacket palpha = pset1<ResPacket>(alpha);
+ ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
+ ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
+
+ for(Index j2=0; j2<cols; j2+=block_cols)
+ {
+ Index jend = numext::mini(j2+block_cols,cols);
+ Index i=0;
+ for(; i<n8; i+=ResPacketSize*8)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0)),
+ c3 = pset1<ResPacket>(ResScalar(0)),
+ c4 = pset1<ResPacket>(ResScalar(0)),
+ c5 = pset1<ResPacket>(ResScalar(0)),
+ c6 = pset1<ResPacket>(ResScalar(0)),
+ c7 = pset1<ResPacket>(ResScalar(0));
+
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+ c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
+ c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
+ c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
+ c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
+ }
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+ pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+ pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
+ pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
+ pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
+ pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
+ }
+ if(i<n4)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0)),
+ c3 = pset1<ResPacket>(ResScalar(0));
+
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+ }
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+ pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+
+ i+=ResPacketSize*4;
+ }
+ if(i<n3)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0));
+
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+ }
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+
+ i+=ResPacketSize*3;
+ }
+ if(i<n2)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0));
+
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+ }
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ i+=ResPacketSize*2;
+ }
+ if(i<n1)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+ }
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ i+=ResPacketSize;
+ }
+ if(HasHalf && i<n_half)
+ {
+ ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
+ c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
+ }
+ pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
+ i+=ResPacketSizeHalf;
+ }
+ if(HasQuarter && i<n_quarter)
+ {
+ ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
+ c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
+ }
+ pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
+ i+=ResPacketSizeQuarter;
+ }
+ for(;i<rows;++i)
+ {
+ ResScalar c0(0);
+ for(Index j=j2; j<jend; j+=1)
+ c0 += cj.pmul(lhs(i,j), rhs(j,0));
+ res[i] += alpha*c0;
+ }
+ }
+}
+
+/* Optimized row-major matrix * vector product:
+ * This algorithm processes up to 8 rows at once (then 4, 2 and 1), which both reduces
+ * the number of loads/stores of the result and shortens
+ * the instruction dependency chains.
+ * same alignment pattern.
+ *
+ * Mixing type logic:
+ * - alpha is always a complex (or converted to a complex)
+ * - no vectorization
+ */
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+ typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
+
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
+
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ ResScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+ Index rows, Index cols,
+ const LhsMapper& alhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ ResScalar alpha)
+{
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
+  // This helps GCC generate proper code.
+ LhsMapper lhs(alhs);
+
+ eigen_internal_assert(rhs.stride()==1);
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+ conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+ conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
+
+ // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  // processing 8 rows at once might be counter-productive with respect to the cache.
+ const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
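+  // Illustrative numbers: for float with a row stride of 10000 elements, 10000*sizeof(float) ==
+  // 40000 > 32000, so n8 == 0 and the 8-rows-at-once path below is skipped in favour of the
+  // 4/2/1-row paths.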
+ const Index n4 = rows-3;
+ const Index n2 = rows-1;
+
+ // TODO: for padded aligned inputs, we could enable aligned reads
+ enum { LhsAlignment = Unaligned,
+ ResPacketSize = Traits::ResPacketSize,
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+ LhsPacketSize = Traits::LhsPacketSize,
+ LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+ LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+ };
+
+ Index i=0;
+ for(; i<n8; i+=8)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0)),
+ c3 = pset1<ResPacket>(ResScalar(0)),
+ c4 = pset1<ResPacket>(ResScalar(0)),
+ c5 = pset1<ResPacket>(ResScalar(0)),
+ c6 = pset1<ResPacket>(ResScalar(0)),
+ c7 = pset1<ResPacket>(ResScalar(0));
+
+ Index j=0;
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+ {
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+ c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
+ c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
+ c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
+ c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
+ }
+ ResScalar cc0 = predux(c0);
+ ResScalar cc1 = predux(c1);
+ ResScalar cc2 = predux(c2);
+ ResScalar cc3 = predux(c3);
+ ResScalar cc4 = predux(c4);
+ ResScalar cc5 = predux(c5);
+ ResScalar cc6 = predux(c6);
+ ResScalar cc7 = predux(c7);
+ for(; j<cols; ++j)
+ {
+ RhsScalar b0 = rhs(j,0);
+
+ cc0 += cj.pmul(lhs(i+0,j), b0);
+ cc1 += cj.pmul(lhs(i+1,j), b0);
+ cc2 += cj.pmul(lhs(i+2,j), b0);
+ cc3 += cj.pmul(lhs(i+3,j), b0);
+ cc4 += cj.pmul(lhs(i+4,j), b0);
+ cc5 += cj.pmul(lhs(i+5,j), b0);
+ cc6 += cj.pmul(lhs(i+6,j), b0);
+ cc7 += cj.pmul(lhs(i+7,j), b0);
+ }
+ res[(i+0)*resIncr] += alpha*cc0;
+ res[(i+1)*resIncr] += alpha*cc1;
+ res[(i+2)*resIncr] += alpha*cc2;
+ res[(i+3)*resIncr] += alpha*cc3;
+ res[(i+4)*resIncr] += alpha*cc4;
+ res[(i+5)*resIncr] += alpha*cc5;
+ res[(i+6)*resIncr] += alpha*cc6;
+ res[(i+7)*resIncr] += alpha*cc7;
+ }
+ for(; i<n4; i+=4)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0)),
+ c3 = pset1<ResPacket>(ResScalar(0));
+
+ Index j=0;
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+ {
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+ }
+ ResScalar cc0 = predux(c0);
+ ResScalar cc1 = predux(c1);
+ ResScalar cc2 = predux(c2);
+ ResScalar cc3 = predux(c3);
+ for(; j<cols; ++j)
+ {
+ RhsScalar b0 = rhs(j,0);
+
+ cc0 += cj.pmul(lhs(i+0,j), b0);
+ cc1 += cj.pmul(lhs(i+1,j), b0);
+ cc2 += cj.pmul(lhs(i+2,j), b0);
+ cc3 += cj.pmul(lhs(i+3,j), b0);
+ }
+ res[(i+0)*resIncr] += alpha*cc0;
+ res[(i+1)*resIncr] += alpha*cc1;
+ res[(i+2)*resIncr] += alpha*cc2;
+ res[(i+3)*resIncr] += alpha*cc3;
+ }
+ for(; i<n2; i+=2)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0));
+
+ Index j=0;
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+ {
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+ }
+ ResScalar cc0 = predux(c0);
+ ResScalar cc1 = predux(c1);
+ for(; j<cols; ++j)
+ {
+ RhsScalar b0 = rhs(j,0);
+
+ cc0 += cj.pmul(lhs(i+0,j), b0);
+ cc1 += cj.pmul(lhs(i+1,j), b0);
+ }
+ res[(i+0)*resIncr] += alpha*cc0;
+ res[(i+1)*resIncr] += alpha*cc1;
+ }
+ for(; i<rows; ++i)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+ ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
+ ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
+ Index j=0;
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+ {
+ RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
+ }
+ ResScalar cc0 = predux(c0);
+ if (HasHalf) {
+ for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
+ {
+ RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
+ c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
+ }
+ cc0 += predux(c0_h);
+ }
+ if (HasQuarter) {
+ for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
+ {
+ RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
+ c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
+ }
+ cc0 += predux(c0_q);
+ }
+ for(; j<cols; ++j)
+ {
+ cc0 += cj.pmul(lhs(i,j), rhs(j,0));
+ }
+ res[i*resIncr] += alpha*cc0;
+ }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
new file mode 100644
index 000000000..6e36c2b3c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -0,0 +1,136 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ * General matrix-vector product functionality based on ?GEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-vector multiplication using the BLAS
+* gemv function via partial specialization of the
+* general_matrix_vector_product::run(..) method for the float, double,
+* std::complex<float> and std::complex<double> types.
+**********************************************************************/
+
+// gemv specialization
+
+template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;
+
+#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
+static void run( \
+ Index rows, Index cols, \
+ const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
+ const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
+ Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+ if (ConjugateLhs) { \
+ general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
+ rows, cols, lhs, rhs, res, resIncr, alpha); \
+ } else { \
+ general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+ rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
+ } \
+} \
+}; \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
+static void run( \
+ Index rows, Index cols, \
+ const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
+ const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
+ Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+ general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+ rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
+} \
+}; \
+
+EIGEN_BLAS_GEMV_SPECIALIZE(double)
+EIGEN_BLAS_GEMV_SPECIALIZE(float)
+EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
+template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
+\
+static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* rhs, Index rhsIncr, \
+ EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
+{ \
+ BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \
+ lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \
+ const EIGTYPE beta(1); \
+ const EIGTYPE *x_ptr; \
+ char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
+ if (LhsStorageOrder==RowMajor) { \
+ m = convert_index<BlasIndex>(cols); \
+ n = convert_index<BlasIndex>(rows); \
+ }\
+ GEMVVector x_tmp; \
+ if (ConjugateRhs) { \
+ Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
+ x_tmp=map_x.conjugate(); \
+ x_ptr=x_tmp.data(); \
+ incx=1; \
+ } else x_ptr=rhs; \
+ BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+}\
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
+#else
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h b/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h
new file mode 100644
index 000000000..8f91879e4
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h
@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARALLELIZER_H
+#define EIGEN_PARALLELIZER_H
+
+#if EIGEN_HAS_CXX11_ATOMIC
+#include <atomic>
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal */
+inline void manage_multi_threading(Action action, int* v)
+{
+ static int m_maxThreads = -1;
+ EIGEN_UNUSED_VARIABLE(m_maxThreads)
+
+ if(action==SetAction)
+ {
+ eigen_internal_assert(v!=0);
+ m_maxThreads = *v;
+ }
+ else if(action==GetAction)
+ {
+ eigen_internal_assert(v!=0);
+ #ifdef EIGEN_HAS_OPENMP
+ if(m_maxThreads>0)
+ *v = m_maxThreads;
+ else
+ *v = omp_get_max_threads();
+ #else
+ *v = 1;
+ #endif
+ }
+ else
+ {
+ eigen_internal_assert(false);
+ }
+}
+
+}
+
+/** Must be called first when calling Eigen from multiple threads */
+inline void initParallel()
+{
+ int nbt;
+ internal::manage_multi_threading(GetAction, &nbt);
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+}
+
+/** \returns the max number of threads reserved for Eigen
+ * \sa setNbThreads */
+inline int nbThreads()
+{
+ int ret;
+ internal::manage_multi_threading(GetAction, &ret);
+ return ret;
+}
+
+/** Sets the max number of threads reserved for Eigen
+ * \sa nbThreads */
+inline void setNbThreads(int v)
+{
+ internal::manage_multi_threading(SetAction, &v);
+}
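+
+// Usage sketch for the three functions above (assuming Eigen is built with OpenMP
+// enabled, e.g. compiled with -fopenmp; without OpenMP, nbThreads() always returns 1):
+//
+//   Eigen::initParallel();       // call once before using Eigen from multiple threads
+//   Eigen::setNbThreads(4);      // reserve at most 4 threads for Eigen's products
+//   int n = Eigen::nbThreads();  // n == 4 here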
+
+namespace internal {
+
+template<typename Index> struct GemmParallelInfo
+{
+ GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+
+ // volatile is not enough on all architectures (see bug 1572)
+ // to guarantee that when thread A says to thread B that it is
+  // done with packing a block, then all writes have really been
+  // carried out... The C++11 memory model + atomics guarantee this.
+#if EIGEN_HAS_CXX11_ATOMIC
+ std::atomic<Index> sync;
+ std::atomic<int> users;
+#else
+ Index volatile sync;
+ int volatile users;
+#endif
+
+ Index lhs_start;
+ Index lhs_length;
+};
+
+template<bool Condition, typename Functor, typename Index>
+void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
+{
+ // TODO when EIGEN_USE_BLAS is defined,
+ // we should still enable OMP for other scalar types
+ // Without C++11, we have to disable GEMM's parallelization on
+  // non-x86 architectures, because volatile alone is not enough there for our purpose.
+ // See bug 1572.
+#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
+ // FIXME the transpose variable is only needed to properly split
+ // the matrix product when multithreading is enabled. This is a temporary
+ // fix to support row-major destination matrices. This whole
+ // parallelizer mechanism has to be redesigned anyway.
+ EIGEN_UNUSED_VARIABLE(depth);
+ EIGEN_UNUSED_VARIABLE(transpose);
+ func(0,rows, 0,cols);
+#else
+
+ // Dynamically check whether we should enable or disable OpenMP.
+ // The conditions are:
+ // - the max number of threads we can create is greater than 1
+  //  - we are not already inside a parallel region
+ // - the sizes are large enough
+
+ // compute the maximal number of threads from the size of the product:
+ // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
+ Index size = transpose ? rows : cols;
+ Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
+
+ // compute the maximal number of threads from the total amount of work:
+ double work = static_cast<double>(rows) * static_cast<double>(cols) *
+ static_cast<double>(depth);
+ double kMinTaskSize = 50000; // FIXME improve this heuristic.
+ pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
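+  // For instance, a 256x256x256 product gives work = 16777216, hence work/kMinTaskSize ~ 335
+  // and this bound is usually not the limiting factor, whereas a 64x64x64 product gives
+  // work = 262144 and caps the thread count at 5 regardless of nbThreads().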
+
+ // compute the number of threads we are going to use
+ Index threads = std::min<Index>(nbThreads(), pb_max_threads);
+
+  // if multi-threading is explicitly disabled, is not useful, or if we are already in a parallel session,
+  // then abort multi-threading
+  // FIXME omp_get_num_threads()>1 only works for OpenMP; what if the user does not use OpenMP?
+ if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
+ return func(0,rows, 0,cols);
+
+ Eigen::initParallel();
+ func.initParallelSession(threads);
+
+ if(transpose)
+ std::swap(rows,cols);
+
+ ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
+
+ #pragma omp parallel num_threads(threads)
+ {
+ Index i = omp_get_thread_num();
+    // Note that the actual number of threads might be lower than the number of requested ones.
+ Index actual_threads = omp_get_num_threads();
+
+ Index blockCols = (cols / actual_threads) & ~Index(0x3);
+ Index blockRows = (rows / actual_threads);
+ blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
+ Index r0 = i*blockRows;
+ Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
+
+ Index c0 = i*blockCols;
+ Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
+
+ info[i].lhs_start = r0;
+ info[i].lhs_length = actualBlockRows;
+
+ if(transpose) func(c0, actualBlockCols, 0, rows, info);
+ else func(0, rows, c0, actualBlockCols, info);
+ }
+#endif
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARALLELIZER_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
new file mode 100644
index 000000000..33ecf10f6
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -0,0 +1,544 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// pack a selfadjoint block diagonal for use with the gebp_kernel
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs
+{
+ template<int BlockRows> inline
+ void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
+ {
+ // normal copy
+ for(Index k=0; k<i; k++)
+ for(Index w=0; w<BlockRows; w++)
+ blockA[count++] = lhs(i+w,k); // normal
+ // symmetric copy
+ Index h = 0;
+ for(Index k=i; k<i+BlockRows; k++)
+ {
+ for(Index w=0; w<h; w++)
+ blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+
+ blockA[count++] = numext::real(lhs(k,k)); // real (diagonal)
+
+ for(Index w=h+1; w<BlockRows; w++)
+ blockA[count++] = lhs(i+w, k); // normal
+ ++h;
+ }
+ // transposed copy
+ for(Index k=i+BlockRows; k<cols; k++)
+ for(Index w=0; w<BlockRows; w++)
+ blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+ }
+ void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+ {
+ typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
+ enum { PacketSize = packet_traits<Scalar>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
+ const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
+ Index count = 0;
+ //Index peeled_mc3 = (rows/Pack1)*Pack1;
+
+ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+
+ if(Pack1>=3*PacketSize)
+ for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+ pack<3*PacketSize>(blockA, lhs, cols, i, count);
+
+ if(Pack1>=2*PacketSize)
+ for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+ pack<2*PacketSize>(blockA, lhs, cols, i, count);
+
+ if(Pack1>=1*PacketSize)
+ for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+ pack<1*PacketSize>(blockA, lhs, cols, i, count);
+
+ if(HasHalf && Pack1>=HalfPacketSize)
+ for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
+ pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+ if(HasQuarter && Pack1>=QuarterPacketSize)
+ for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
+ pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
+
+ // do the same with mr==1
+ for(Index i=peeled_mc_quarter; i<rows; i++)
+ {
+ for(Index k=0; k<i; k++)
+ blockA[count++] = lhs(i, k); // normal
+
+ blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)
+
+ for(Index k=i+1; k<cols; k++)
+ blockA[count++] = numext::conj(lhs(k, i)); // transposed
+ }
+ }
+};
+
+template<typename Scalar, typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs
+{
+ enum { PacketSize = packet_traits<Scalar>::size };
+ void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+ {
+ Index end_k = k2 + rows;
+ Index count = 0;
+ const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+
+ // first part: normal case
+ for(Index j2=0; j2<k2; j2+=nr)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ if (nr>=4)
+ {
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ }
+ if (nr>=8)
+ {
+ blockB[count+4] = rhs(k,j2+4);
+ blockB[count+5] = rhs(k,j2+5);
+ blockB[count+6] = rhs(k,j2+6);
+ blockB[count+7] = rhs(k,j2+7);
+ }
+ count += nr;
+ }
+ }
+
+ // second part: diagonal block
+ Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+ if(nr>=8)
+ {
+ for(Index j2=k2; j2<end8; j2+=8)
+ {
+ // again we can split vertically in three different parts (transpose, symmetric, normal)
+ // transpose
+ for(Index k=k2; k<j2; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ blockB[count+4] = numext::conj(rhs(j2+4,k));
+ blockB[count+5] = numext::conj(rhs(j2+5,k));
+ blockB[count+6] = numext::conj(rhs(j2+6,k));
+ blockB[count+7] = numext::conj(rhs(j2+7,k));
+ count += 8;
+ }
+ // symmetric
+ Index h = 0;
+ for(Index k=j2; k<j2+8; k++)
+ {
+ // normal
+ for (Index w=0 ; w<h; ++w)
+ blockB[count+w] = rhs(k,j2+w);
+
+ blockB[count+h] = numext::real(rhs(k,k));
+
+ // transpose
+ for (Index w=h+1 ; w<8; ++w)
+ blockB[count+w] = numext::conj(rhs(j2+w,k));
+ count += 8;
+ ++h;
+ }
+ // normal
+ for(Index k=j2+8; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ blockB[count+4] = rhs(k,j2+4);
+ blockB[count+5] = rhs(k,j2+5);
+ blockB[count+6] = rhs(k,j2+6);
+ blockB[count+7] = rhs(k,j2+7);
+ count += 8;
+ }
+ }
+ }
+ if(nr>=4)
+ {
+ for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
+ {
+ // again we can split vertically in three different parts (transpose, symmetric, normal)
+ // transpose
+ for(Index k=k2; k<j2; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ count += 4;
+ }
+ // symmetric
+ Index h = 0;
+ for(Index k=j2; k<j2+4; k++)
+ {
+ // normal
+ for (Index w=0 ; w<h; ++w)
+ blockB[count+w] = rhs(k,j2+w);
+
+ blockB[count+h] = numext::real(rhs(k,k));
+
+ // transpose
+ for (Index w=h+1 ; w<4; ++w)
+ blockB[count+w] = numext::conj(rhs(j2+w,k));
+ count += 4;
+ ++h;
+ }
+ // normal
+ for(Index k=j2+4; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ count += 4;
+ }
+ }
+ }
+
+ // third part: transposed
+ if(nr>=8)
+ {
+ for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ blockB[count+4] = numext::conj(rhs(j2+4,k));
+ blockB[count+5] = numext::conj(rhs(j2+5,k));
+ blockB[count+6] = numext::conj(rhs(j2+6,k));
+ blockB[count+7] = numext::conj(rhs(j2+7,k));
+ count += 8;
+ }
+ }
+ }
+ if(nr>=4)
+ {
+ for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ count += 4;
+ }
+ }
+ }
+
+ // copy the remaining columns one at a time (=> the same with nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ // transpose
+ Index half = (std::min)(end_k,j2);
+ for(Index k=k2; k<half; k++)
+ {
+ blockB[count] = numext::conj(rhs(j2,k));
+ count += 1;
+ }
+
+ if(half==j2 && half<k2+rows)
+ {
+ blockB[count] = numext::real(rhs(j2,j2));
+ count += 1;
+ }
+ else
+ half--;
+
+ // normal
+ for(Index k=half+1; k<k2+rows; k++)
+ {
+ blockB[count] = rhs(k,j2);
+ count += 1;
+ }
+ }
+ }
+};
+
+/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+ int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+ int ResStorageOrder, int ResInnerStride>
+struct product_selfadjoint_matrix;
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+ int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+ int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
+{
+
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs, Index rhsStride,
+ Scalar* res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ product_selfadjoint_matrix<Scalar, Index,
+ EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+ RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
+ EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+ LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
+ ColMajor,ResInnerStride>
+ ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
+ }
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
+{
+
+ static EIGEN_DONT_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index size = rows;
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride, resIncr);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ // kc must be smaller than mc
+ kc = (std::min)(kc,mc);
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+
+ for(Index k2=0; k2<size; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+ // we have selected one row panel of rhs and one column panel of lhs
+ // pack rhs's panel into a sequential chunk of memory
+ // and expand each coeff to a constant packet for further reuse
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
+
+      // the selected lhs's panel has to be split into three different parts:
+ // 1 - the transposed panel above the diagonal block => transposed packed copy
+ // 2 - the diagonal block => special packed copy
+ // 3 - the panel below the diagonal block => generic packed copy
+ for(Index i2=0; i2<k2; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,k2)-i2;
+ // transposed packed copy
+ pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ // the block diagonal
+ {
+ const Index actual_mc = (std::min)(k2+kc,size)-k2;
+ // symmetric packed copy
+ pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+
+ for(Index i2=k2+kc; i2<size; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,size)-i2;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ }
+ }
+
+// matrix * selfadjoint product
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
+{
+
+ static EIGEN_DONT_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index size = cols;
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ ResMapper res(_res,resStride, resIncr);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+ symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+
+ for(Index k2=0; k2<size; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+ pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
+
+ // => GEPP
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ }
+ }
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_matrix
+***************************************************************************/
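+//
+// Typical entry point (a sketch; only the stored lower triangle of A is referenced):
+//
+//   Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
+//   Eigen::MatrixXd B = Eigen::MatrixXd::Random(100, 20);
+//   Eigen::MatrixXd C = A.selfadjointView<Eigen::Lower>() * B;
+//
+// Such expressions are expected to be evaluated through selfadjoint_product_impl below,
+// which forwards to product_selfadjoint_matrix.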
+
+namespace internal {
+
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
+{
+ typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+
+ enum {
+ LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
+ LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
+ RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
+ RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
+ };
+
+ template<typename Dest>
+ static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+ {
+ eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+ * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+ Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;
+
+ BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
+
+ internal::product_selfadjoint_matrix<Scalar, Index,
+ EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+ NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
+ EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+ NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
+ internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor,
+ Dest::InnerStrideAtCompileTime>
+ ::run(
+ lhs.rows(), rhs.cols(), // sizes
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
+ &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
+ actualAlpha, blocking // alpha
+ );
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
new file mode 100644
index 000000000..61396dbdf
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -0,0 +1,295 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
+{\
+\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resIncr, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+ { \
+ EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+ eigen_assert(resIncr == 1); \
+ char side='L', uplo='L'; \
+ BlasIndex m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ EIGTYPE beta(1); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ ldb = convert_index<BlasIndex>(rhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+ if (LhsStorageOrder==RowMajor) uplo='U'; \
+ a = _lhs; \
+\
+ if (RhsStorageOrder==RowMajor) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.adjoint(); \
+ b = b_tmp.data(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+ } else b = _rhs; \
+\
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+ } \
+};
+
+
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
+{\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resIncr, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+ { \
+ EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+ eigen_assert(resIncr == 1); \
+ char side='L', uplo='L'; \
+ BlasIndex m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ EIGTYPE beta(1); \
+ MatrixX##EIGPREFIX b_tmp; \
+ Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ ldb = convert_index<BlasIndex>(rhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+ if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
+ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
+ a_tmp = lhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+ } else a = _lhs; \
+ if (LhsStorageOrder==RowMajor) uplo='U'; \
+\
+ if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
+ b = _rhs; } \
+ else { \
+ if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.conjugate(); \
+ } else \
+ if (ConjugateRhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.adjoint(); \
+ } else { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.transpose(); \
+ } \
+ b = b_tmp.data(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+ } \
+\
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
+#endif
+
+/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
+{\
+\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resIncr, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+ { \
+ EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+ eigen_assert(resIncr == 1); \
+ char side='R', uplo='L'; \
+ BlasIndex m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ EIGTYPE beta(1); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+/* Set m, n, k */ \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = convert_index<BlasIndex>(rhsStride); \
+ ldb = convert_index<BlasIndex>(lhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+ if (RhsStorageOrder==RowMajor) uplo='U'; \
+ a = _rhs; \
+\
+ if (LhsStorageOrder==RowMajor) { \
+    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.adjoint(); \
+ b = b_tmp.data(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+ } else b = _lhs; \
+\
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+ } \
+};
+
+
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
+{\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resIncr, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+ { \
+ EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+ eigen_assert(resIncr == 1); \
+ char side='R', uplo='L'; \
+ BlasIndex m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ EIGTYPE beta(1); \
+ MatrixX##EIGPREFIX b_tmp; \
+ Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
+\
+/* Set m, n, k */ \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = convert_index<BlasIndex>(rhsStride); \
+ ldb = convert_index<BlasIndex>(lhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+ if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
+ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
+ a_tmp = rhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+ } else a = _rhs; \
+ if (RhsStorageOrder==RowMajor) uplo='U'; \
+\
+ if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
+ b = _lhs; } \
+ else { \
+ if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.conjugate(); \
+ } else \
+ if (ConjugateLhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.adjoint(); \
+ } else { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.transpose(); \
+ } \
+ b = b_tmp.data(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+ } \
+\
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
+#endif
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
new file mode 100644
index 000000000..d38fd72b2
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized selfadjoint matrix * vector product:
+ * This algorithm processes 2 columns at once, which both reduces
+ * the number of loads/stores of the result by a factor of 2 and reduces
+ * the instruction dependency.
+ */
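+/* Roughly, one pass over the stored part of columns j and j+1 performs
+ *   res[i]  += alpha * (A(i,j)*rhs[j] + A(i,j+1)*rhs[j+1])      (column contribution)
+ * while accumulating the dot products of those two columns with rhs, which then yield
+ *   res[j]  += alpha * <A(:,j), rhs>  and  res[j+1] += alpha * <A(:,j+1), rhs>  (row contribution),
+ * so each stored coefficient is loaded once yet contributes to both halves of the
+ * selfadjoint product (conjugations omitted for clarity).
+ */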
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
+struct selfadjoint_matrix_vector_product;
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+struct selfadjoint_matrix_vector_product
+{
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
+ Index size,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs,
+ Scalar* res,
+ Scalar alpha);
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+ Index size,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs,
+ Scalar* res,
+ Scalar alpha)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+ enum {
+ IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
+ IsLower = UpLo == Lower ? 1 : 0,
+ FirstTriangular = IsRowMajor == IsLower
+ };
+
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+ conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd;
+
+ conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
+ conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
+
+ Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
+
+ Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
+ if (FirstTriangular)
+ bound = size - bound;
+
+ for (Index j=FirstTriangular ? bound : 0;
+ j<(FirstTriangular ? size : bound);j+=2)
+ {
+ const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+ const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+
+ Scalar t0 = cjAlpha * rhs[j];
+ Packet ptmp0 = pset1<Packet>(t0);
+ Scalar t1 = cjAlpha * rhs[j+1];
+ Packet ptmp1 = pset1<Packet>(t1);
+
+ Scalar t2(0);
+ Packet ptmp2 = pset1<Packet>(t2);
+ Scalar t3(0);
+ Packet ptmp3 = pset1<Packet>(t3);
+
+ Index starti = FirstTriangular ? 0 : j+2;
+ Index endi = FirstTriangular ? j : size;
+ Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
+ Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+
+ res[j] += cjd.pmul(numext::real(A0[j]), t0);
+ res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
+ if(FirstTriangular)
+ {
+ res[j] += cj0.pmul(A1[j], t1);
+ t3 += cj1.pmul(A1[j], rhs[j]);
+ }
+ else
+ {
+ res[j+1] += cj0.pmul(A0[j+1],t0);
+ t2 += cj1.pmul(A0[j+1], rhs[j+1]);
+ }
+
+ for (Index i=starti; i<alignedStart; ++i)
+ {
+ res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ t3 += cj1.pmul(A1[i], rhs[i]);
+ }
+    // Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed up);
+ // gcc 4.2 does this optimization automatically.
+ const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+ const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
+ const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+ Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+ for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+ {
+ Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize;
+ Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize;
+ Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
+ Packet Xi = pload <Packet>(resIt);
+
+ Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
+ ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+ ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
+ pstore(resIt,Xi); resIt += PacketSize;
+ }
+ for (Index i=alignedEnd; i<endi; i++)
+ {
+ res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ t3 += cj1.pmul(A1[i], rhs[i]);
+ }
+
+ res[j] += alpha * (t2 + predux(ptmp2));
+ res[j+1] += alpha * (t3 + predux(ptmp3));
+ }
+ for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
+ {
+ const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+
+ Scalar t1 = cjAlpha * rhs[j];
+ Scalar t2(0);
+ res[j] += cjd.pmul(numext::real(A0[j]), t1);
+ for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
+ {
+ res[i] += cj0.pmul(A0[i], t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ }
+ res[j] += alpha * t2;
+ }
+}
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_vector
+***************************************************************************/
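+//
+// Typical entry point (a sketch; only the stored upper triangle of A is referenced):
+//
+//   Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
+//   Eigen::VectorXd x = Eigen::VectorXd::Random(100);
+//   Eigen::VectorXd y = A.selfadjointView<Eigen::Upper>() * x;
+//
+// Such products are expected to be evaluated through selfadjoint_product_impl below,
+// which forwards to selfadjoint_matrix_vector_product.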
+
+namespace internal {
+
+template<typename Lhs, int LhsMode, typename Rhs>
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
+{
+ typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+ typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+ typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+ enum { LhsUpLo = LhsMode&(Upper|Lower) };
+
+ template<typename Dest>
+ static EIGEN_DEVICE_FUNC
+ void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+ {
+ typedef typename Dest::Scalar ResScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+
+ eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+ * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+ enum {
+ EvalToDest = (Dest::InnerStrideAtCompileTime==1),
+ UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1)
+ };
+
+ internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
+ internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+
+ ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+ EvalToDest ? dest.data() : static_dest.data());
+
+ ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
+ UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
+
+ if(!EvalToDest)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ Index size = dest.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ MappedDest(actualDestPtr, dest.size()) = dest;
+ }
+
+ if(!UseRhs)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ Index size = rhs.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+ }
+
+
+ internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+ (
+ lhs.rows(), // size
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ actualRhsPtr, // rhs info
+ actualDestPtr, // result info
+ actualAlpha // scale factor
+ );
+
+ if(!EvalToDest)
+ dest = MappedDest(actualDestPtr, dest.size());
+ }
+};
+
+template<typename Lhs, typename Rhs, int RhsMode>
+struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false>
+{
+ typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+ enum { RhsUpLo = RhsMode&(Upper|Lower) };
+
+ template<typename Dest>
+ static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+ {
+ // let's simply transpose the product
+ Transpose<Dest> destT(dest);
+ selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+ Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
new file mode 100644
index 000000000..1238345e3
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -0,0 +1,118 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ * Selfadjoint matrix-vector product functionality based on ?SYMV/?HEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements selfadjoint matrix-vector multiplication using BLAS
+**********************************************************************/
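+//
+// As with the GEMV bindings, these specializations only take effect when Eigen is
+// compiled with EIGEN_USE_BLAS; a product such as
+//
+//   Eigen::VectorXd y = A.selfadjointView<Eigen::Lower>() * x;  // A: MatrixXd, x: VectorXd
+//
+// is then expected to be forwarded to dsymv (or zhemv for complex scalars).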
+
+// symv/hemv specialization
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+struct selfadjoint_matrix_vector_product_symv :
+ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
+static void run( \
+ Index size, const Scalar* lhs, Index lhsStride, \
+ const Scalar* _rhs, Scalar* res, Scalar alpha) { \
+ enum {\
+ IsColMajor = StorageOrder==ColMajor \
+ }; \
+ if (IsColMajor == ConjugateLhs) {\
+ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
+ size, lhs, lhsStride, _rhs, res, alpha); \
+ } else {\
+ selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
+ size, lhs, lhsStride, _rhs, res, alpha); \
+ }\
+ } \
+}; \
+
+EIGEN_BLAS_SYMV_SPECIALIZE(double)
+EIGEN_BLAS_SYMV_SPECIALIZE(float)
+EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
+\
+static void run( \
+Index size, const EIGTYPE* lhs, Index lhsStride, \
+const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
+{ \
+ enum {\
+ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
+ IsLower = UpLo == Lower ? 1 : 0 \
+ }; \
+ BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \
+ EIGTYPE beta(1); \
+ const EIGTYPE *x_ptr; \
+ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
+ SYMVVector x_tmp; \
+ if (ConjugateRhs) { \
+ Map<const SYMVVector, 0 > map_x(_rhs,size,1); \
+ x_tmp=map_x.conjugate(); \
+ x_ptr=x_tmp.data(); \
+ } else x_ptr=_rhs; \
+ BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+}\
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+#else
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h
new file mode 100644
index 000000000..a21be8050
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h
@@ -0,0 +1,133 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_PRODUCT_H
+#define EIGEN_SELFADJOINT_PRODUCT_H
+
+/**********************************************************************
+* This file implements a selfadjoint product: C += A A^T updating only
+* half of the selfadjoint matrix C.
+* It corresponds to the level 3 SYRK and level 2 SYR BLAS routines.
+**********************************************************************/
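+//
+// These updates are typically reached through SelfAdjointView::rankUpdate, e.g. (a sketch):
+//
+//   Eigen::MatrixXd C = Eigen::MatrixXd::Zero(100, 100);
+//   Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 10);
+//   Eigen::VectorXd u = Eigen::VectorXd::Random(100);
+//   C.selfadjointView<Eigen::Lower>().rankUpdate(A, 2.0);  // C += 2*A*A^T  (SYRK-like)
+//   C.selfadjointView<Eigen::Lower>().rankUpdate(u, 1.0);  // C += u*u^T    (SYR-like)
+//
+// with only the lower triangle of C actually written.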
+
+namespace Eigen {
+
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs>
+{
+ static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+ {
+ internal::conj_if<ConjRhs> cj;
+ typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+ typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjLhsType;
+ for (Index i=0; i<size; ++i)
+ {
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1)))
+ += (alpha * cj(vecY[i])) * ConjLhsType(OtherMap(vecX+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1)));
+ }
+ }
+};
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs>
+{
+ static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+ {
+ selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vecY,vecX,alpha);
+ }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime>
+struct selfadjoint_product_selector;
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
+{
+ static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef internal::blas_traits<OtherType> OtherBlasTraits;
+ typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+ typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+ typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+ Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+ enum {
+ StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1
+ };
+ internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
+ (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
+
+ if(!UseOtherDirectly)
+ Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
+
+ selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+ OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
+ ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualOtherPtr, actualAlpha);
+ }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
+{
+ static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef internal::blas_traits<OtherType> OtherBlasTraits;
+ typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+ typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+ typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+ Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+ enum {
+ IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+ OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0
+ };
+
+ Index size = mat.cols();
+ Index depth = actualOther.cols();
+
+ typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar,
+ MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType;
+
+ BlockingType blocking(size, size, depth, 1, false);
+
+
+ internal::general_matrix_matrix_triangular_product<Index,
+ Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+ IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
+ ::run(size, depth,
+ actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(),
+ mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
+ }
+};
+
+// high level API
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
+{
+ selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_PRODUCT_H
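Illustrative usage sketch (editorial, not part of the patch): the high-level entry point implemented above is SelfAdjointView::rankUpdate, which writes only the selected triangle (SYRK/SYR semantics). Sizes and scalars below are arbitrary.

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(3, 3);
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 5);
  Eigen::VectorXd u = Eigen::VectorXd::Random(3);
  C.selfadjointView<Eigen::Lower>().rankUpdate(A, 0.5); // C += 0.5 * A * A^T, lower half only
  C.selfadjointView<Eigen::Lower>().rankUpdate(u);      // rank-1 update, alpha defaults to 1
  return 0;
}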
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
new file mode 100644
index 000000000..f752a0bf0
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -0,0 +1,94 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTRANK2UPDATE_H
+#define EIGEN_SELFADJOINTRANK2UPDATE_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * It corresponds to the Level2 syr2 BLAS routine
+ */
+
+template<typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+struct selfadjoint_rank2_update_selector;
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
+{
+ static EIGEN_DEVICE_FUNC
+ void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ {
+ const Index size = u.size();
+ for (Index i=0; i<size; ++i)
+ {
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) +=
+ (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size-i)
+ + (alpha * numext::conj(v.coeff(i))) * u.tail(size-i);
+ }
+ }
+};
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper>
+{
+ static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ {
+ const Index size = u.size();
+ for (Index i=0; i<size; ++i)
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) +=
+ (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i+1)
+ + (alpha * numext::conj(v.coeff(i))) * u.head(i+1);
+ }
+};
+
+template<bool Cond, typename T> struct conj_expr_if
+ : conditional<!Cond, const T&,
+ CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {};
+
+} // end namespace internal
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU, typename DerivedV>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
+{
+ typedef internal::blas_traits<DerivedU> UBlasTraits;
+ typedef typename UBlasTraits::DirectLinearAccessType ActualUType;
+ typedef typename internal::remove_all<ActualUType>::type _ActualUType;
+ typename internal::add_const_on_value_type<ActualUType>::type actualU = UBlasTraits::extract(u.derived());
+
+ typedef internal::blas_traits<DerivedV> VBlasTraits;
+ typedef typename VBlasTraits::DirectLinearAccessType ActualVType;
+ typedef typename internal::remove_all<ActualVType>::type _ActualVType;
+ typename internal::add_const_on_value_type<ActualVType>::type actualV = VBlasTraits::extract(v.derived());
+
+ // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
+ // vice versa, and take the complex conjugate of all coefficients and vector entries.
+
+ enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+ Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived())
+ * numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
+ if (IsRowMajor)
+ actualAlpha = numext::conj(actualAlpha);
+
+ typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), _ActualUType>::type>::type UType;
+ typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), _ActualVType>::type>::type VType;
+ internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
+ (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
+ ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTRANK2UPDATE_H
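Illustrative sketch (editorial, not part of the patch): the two-argument rankUpdate defined above performs the SYR2-style update C += alpha * u v' + conj(alpha) * v u', touching only one triangle. Values below are arbitrary.

#include <Eigen/Dense>
#include <complex>

int main()
{
  Eigen::MatrixXcd C = Eigen::MatrixXcd::Zero(3, 3);
  Eigen::VectorXcd u = Eigen::VectorXcd::Random(3);
  Eigen::VectorXcd v = Eigen::VectorXcd::Random(3);
  // Rank-2 update of the upper triangle; the result stays Hermitian by construction.
  C.selfadjointView<Eigen::Upper>().rankUpdate(u, v, std::complex<double>(2.0, 0.0));
  return 0;
}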
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
new file mode 100644
index 000000000..f0c60507a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -0,0 +1,472 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
+// struct gemm_pack_lhs_triangular
+// {
+// Matrix<Scalar,mr,mr,
+// void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
+// {
+// conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+// const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
+// int count = 0;
+// const int peeled_mc = (rows/mr)*mr;
+// for(int i=0; i<peeled_mc; i+=mr)
+// {
+// for(int k=0; k<depth; k++)
+// for(int w=0; w<mr; w++)
+// blockA[count++] = cj(lhs(i+w, k));
+// }
+// for(int i=peeled_mc; i<rows; i++)
+// {
+// for(int k=0; k<depth; k++)
+// blockA[count++] = cj(lhs(i, k));
+// }
+// }
+// };
+
+/* Optimized triangular matrix * matrix (_TRMM++) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder, int ResInnerStride,
+ int Version = Specialized>
+struct product_triangular_matrix_matrix;
+
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version>
+{
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols, Index depth,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs, Index rhsStride,
+ Scalar* res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ product_triangular_matrix_matrix<Scalar, Index,
+ (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
+ (!LhsIsTriangular),
+ RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+ ConjugateRhs,
+ LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+ ConjugateLhs,
+ ColMajor, ResInnerStride>
+ ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
+ }
+};
+
+// implements col-major += alpha * op(triangular) * op(general)
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
+{
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower,
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+ };
+
+ static EIGEN_DONT_INLINE void run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ // strip zeros
+ Index diagSize = (std::min)(_rows,_depth);
+ Index rows = IsLower ? _rows : diagSize;
+ Index depth = IsLower ? diagSize : _depth;
+ Index cols = _cols;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride, resIncr);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ // The small panel size must not be larger than blocking size.
+ // Usually this should never be the case because SmallPanelWidth^2 is very small
+ // compared to L2 cache size, but let's be safe:
+ Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc));
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ // To work around an "error: member reference base type 'Matrix<...>
+ // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
+ // not a structure or union" compilation error in nvcc (tested V8.0.61),
+ // create a dummy internal::constructor_without_unaligned_array_assert
+ // object to pass to the Matrix constructor.
+ internal::constructor_without_unaligned_array_assert a;
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
+ triangularBuffer.setZero();
+ if((Mode&ZeroDiag)==ZeroDiag)
+ triangularBuffer.diagonal().setZero();
+ else
+ triangularBuffer.diagonal().setOnes();
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+
+ for(Index k2=IsLower ? depth : 0;
+ IsLower ? k2>0 : k2<depth;
+ IsLower ? k2-=kc : k2+=kc)
+ {
+ Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
+ Index actual_k2 = IsLower ? k2-actual_kc : k2;
+
+ // align blocks with the end of the triangular part for trapezoidal lhs
+ if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
+ {
+ actual_kc = rows-k2;
+ k2 = k2+actual_kc-kc;
+ }
+
+ pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
+
+ // the selected lhs's panel has to be split in three different parts:
+ // 1 - the part which is zero => skip it
+ // 2 - the diagonal block => special kernel
+ // 3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
+
+ // the block diagonal, if any:
+ if(IsLower || actual_k2<rows)
+ {
+ // for each small vertical panels of lhs
+ for (Index k1=0; k1<actual_kc; k1+=panelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth);
+ Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
+ Index startBlock = actual_k2+k1;
+ Index blockBOffset = k1;
+
+ // => GEBP with the micro triangular block
+ // The trick is to pack this micro block while filling the opposite triangular part with zeros.
+ // To this end we do an extra triangular copy to a small temporary buffer
+ for (Index k=0;k<actualPanelWidth;++k)
+ {
+ if (SetDiag)
+ triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
+ for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
+ triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
+ }
+ pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
+
+ gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
+ actualPanelWidth, actualPanelWidth, cols, alpha,
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+
+ // GEBP with remaining micro panel
+ if (lengthTarget>0)
+ {
+ Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
+
+ pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+ gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
+ lengthTarget, actualPanelWidth, cols, alpha,
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+ }
+ }
+ }
+ // the part below (lower case) or above (upper case) the diagonal => GEPP
+ {
+ Index start = IsLower ? k2 : 0;
+ Index end = IsLower ? rows : (std::min)(actual_k2,rows);
+ for(Index i2=start; i2<end; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,end)-i2;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
+ actual_kc, cols, alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+
+// implements col-major += alpha * op(general) * op(triangular)
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
+{
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower,
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+ };
+
+ static EIGEN_DONT_INLINE void run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResInnerStride, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resIncr, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
+ // strip zeros
+ Index diagSize = (std::min)(_cols,_depth);
+ Index rows = _rows;
+ Index depth = IsLower ? _depth : diagSize;
+ Index cols = IsLower ? diagSize : _cols;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride, resIncr);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar);
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ internal::constructor_without_unaligned_array_assert a;
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
+ triangularBuffer.setZero();
+ if((Mode&ZeroDiag)==ZeroDiag)
+ triangularBuffer.diagonal().setZero();
+ else
+ triangularBuffer.diagonal().setOnes();
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+
+ for(Index k2=IsLower ? 0 : depth;
+ IsLower ? k2<depth : k2>0;
+ IsLower ? k2+=kc : k2-=kc)
+ {
+ Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
+ Index actual_k2 = IsLower ? k2 : k2-actual_kc;
+
+ // align blocks with the end of the triangular part for trapezoidal rhs
+ if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
+ {
+ actual_kc = cols-k2;
+ k2 = actual_k2 + actual_kc - kc;
+ }
+
+ // remaining size
+ Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
+ // size of the triangular part
+ Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
+
+ Scalar* geb = blockB+ts*ts;
+ geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
+
+ pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
+
+ // pack the triangular part of the rhs padding the unrolled blocks with zeros
+ if(ts>0)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index actual_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+ // general part
+ pack_rhs_panel(blockB+j2*actual_kc,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+ panelLength, actualPanelWidth,
+ actual_kc, panelOffset);
+
+ // append the triangular part via a temporary buffer
+ for (Index j=0;j<actualPanelWidth;++j)
+ {
+ if (SetDiag)
+ triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
+ for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
+ triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
+ }
+
+ pack_rhs_panel(blockB+j2*actual_kc,
+ RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
+ actualPanelWidth, actualPanelWidth,
+ actual_kc, j2);
+ }
+ }
+
+ for (Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,rows-i2);
+ pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+ // triangular kernel
+ if(ts>0)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
+ Index blockOffset = IsLower ? j2 : 0;
+
+ gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
+ blockA, blockB+j2*actual_kc,
+ actual_mc, panelLength, actualPanelWidth,
+ alpha,
+ actual_kc, actual_kc, // strides
+ blockOffset, blockOffset);// offsets
+ }
+ }
+ gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
+ blockA, geb, actual_mc, actual_kc, rs,
+ alpha,
+ -1, -1, 0, 0);
+ }
+ }
+ }
+
+/***************************************************************************
+* Wrapper to product_triangular_matrix_matrix
+***************************************************************************/
+
+} // end namespace internal
+
+namespace internal {
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
+{
+ template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
+ {
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef typename Dest::Scalar Scalar;
+
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+ typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+ typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+ LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
+ RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
+ Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+ Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
+
+ enum { IsLower = (Mode&Lower) == Lower };
+ Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
+ Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
+ Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
+ : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
+
+ BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
+
+ internal::product_triangular_matrix_matrix<Scalar, Index,
+ Mode, LhsIsTriangular,
+ (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+ (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+ (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
+ ::run(
+ stripedRows, stripedCols, stripedDepth, // sizes
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
+ &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
+ actualAlpha, blocking
+ );
+
+ // Apply correction if the diagonal is unit and a scalar factor was nested:
+ if ((Mode&UnitDiag)==UnitDiag)
+ {
+ if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
+ {
+ Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+ dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
+ }
+ else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1))
+ {
+ Index diagSize = (std::min)(rhs.rows(),rhs.cols());
+ dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
+ }
+ }
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
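Illustrative sketch (editorial, not part of the patch): products of a TriangularView with a dense matrix are what triangular_product_impl above routes to product_triangular_matrix_matrix. Sizes are arbitrary.

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 6);
  Eigen::MatrixXd C(4, 6);
  C.noalias() = A.triangularView<Eigen::Lower>() * B;                   // op(triangular) * general
  Eigen::MatrixXd D = B.transpose() * A.triangularView<Eigen::Upper>(); // general * op(triangular)
  return 0;
}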
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
new file mode 100644
index 000000000..a98d12e4a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -0,0 +1,317 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ * Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder>
+struct product_triangular_matrix_matrix_trmm :
+ product_triangular_matrix_matrix<Scalar,Index,Mode,
+ LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
+ static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
+ const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+ EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+ eigen_assert(resIncr == 1); \
+ product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
+ LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
+ RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+ } \
+};
+
+EIGEN_BLAS_TRMM_SPECIALIZE(double, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
+
+// implements col-major += alpha * op(triangular) * op(general)
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
+ LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \
+ }; \
+\
+ static void run( \
+ Index _rows, Index _cols, Index _depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+ { \
+ Index diagSize = (std::min)(_rows,_depth); \
+ Index rows = IsLower ? _rows : diagSize; \
+ Index depth = IsLower ? diagSize : _depth; \
+ Index cols = _cols; \
+\
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - does not fit BLAS ?TRMM; fall back to the default triangular product or call BLAS ?GEMM */ \
+ if (rows != depth) { \
+\
+ /* FIXME handle mkl_domain_get_max_threads */ \
+ /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\
+\
+ if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
+ /* Most likely no benefit to call TRMM or GEMM from BLAS */ \
+ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
+ /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+ } else { \
+ /* It makes sense to call GEMM */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
+ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
+ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
+ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \
+\
+ /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
+ } \
+ return; \
+ } \
+ char side = 'L', transa, uplo, diag = 'N'; \
+ EIGTYPE *b; \
+ const EIGTYPE *a; \
+ BlasIndex m, n, lda, ldb; \
+\
+/* Set m, n */ \
+ m = convert_index<BlasIndex>(diagSize); \
+ n = convert_index<BlasIndex>(cols); \
+\
+/* Set trans */ \
+ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+ if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
+ b = b_tmp.data(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+\
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixLhs a_tmp; \
+\
+ if ((conjA!=0) || (SetDiag==0)) { \
+ if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \
+ if (IsZeroDiag) \
+ a_tmp.diagonal().setZero(); \
+ else if (IsUnitDiag) \
+ a_tmp.diagonal().setOnes();\
+ a = a_tmp.data(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+ } else { \
+ a = _lhs; \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ } \
+ /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+ res_tmp=res_tmp+b_tmp; \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm)
+EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
+#endif
+
+// implements col-major += alpha * op(general) * op(triangular)
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
+ LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \
+ }; \
+\
+ static void run( \
+ Index _rows, Index _cols, Index _depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+ { \
+ Index diagSize = (std::min)(_cols,_depth); \
+ Index rows = _rows; \
+ Index depth = IsLower ? _depth : diagSize; \
+ Index cols = IsLower ? diagSize : _cols; \
+\
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - does not fit BLAS ?TRMM; fall back to the default triangular product or call BLAS ?GEMM */ \
+ if (cols != depth) { \
+\
+ int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \
+\
+ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
+ /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
+ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
+ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+ } else { \
+ /* It makes sense to call GEMM */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
+ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
+ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
+ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \
+\
+ /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
+ } \
+ return; \
+ } \
+ char side = 'R', transa, uplo, diag = 'N'; \
+ EIGTYPE *b; \
+ const EIGTYPE *a; \
+ BlasIndex m, n, lda, ldb; \
+\
+/* Set m, n */ \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(diagSize); \
+\
+/* Set trans */ \
+ transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+ if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
+ b = b_tmp.data(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+\
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+ MatrixRhs a_tmp; \
+\
+ if ((conjA!=0) || (SetDiag==0)) { \
+ if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \
+ if (IsZeroDiag) \
+ a_tmp.diagonal().setZero(); \
+ else if (IsUnitDiag) \
+ a_tmp.diagonal().setOnes();\
+ a = a_tmp.data(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+ } else { \
+ a = _rhs; \
+ lda = convert_index<BlasIndex>(rhsStride); \
+ } \
+ /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+ res_tmp=res_tmp+b_tmp; \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm)
+EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
+#endif
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
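Editorial note (not part of the patch): the specializations above are only compiled when the BLAS backend is enabled (EIGEN_USE_BLAS, or EIGEN_USE_MKL for the MKL variants) and a BLAS library is linked. Square triangular factors are forwarded to ?trmm; for trapezoidal factors the code above falls back to the built-in kernel or to ?gemm. A hedged sketch:

#define EIGEN_USE_BLAS   // requires linking a BLAS implementation
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 32);
  Eigen::MatrixXd C = A.triangularView<Eigen::Upper>() * B; // expected to hit dtrmm
  return 0;
}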
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h
new file mode 100644
index 000000000..76bfa159c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -0,0 +1,350 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
+#define EIGEN_TRIANGULARMATRIXVECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder, int Version=Specialized>
+struct triangular_matrix_vector_product;
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+{
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ enum {
+ IsLower = ((Mode&Lower)==Lower),
+ HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+ HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+ };
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
+ {
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ Index size = (std::min)(_rows,_cols);
+ Index rows = IsLower ? _rows : (std::min)(_rows,_cols);
+ Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
+
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+ typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+ typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
+ const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
+ typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+ typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
+ ResMap res(_res,rows);
+
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+ for (Index pi=0; pi<size; pi+=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = pi + k;
+ Index s = IsLower ? ((HasUnitDiag||HasZeroDiag) ? i+1 : i ) : pi;
+ Index r = IsLower ? actualPanelWidth-k : k+1;
+ if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+ res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r);
+ if (HasUnitDiag)
+ res.coeffRef(i) += alpha * cjRhs.coeff(i);
+ }
+ Index r = IsLower ? rows - pi - actualPanelWidth : pi;
+ if (r>0)
+ {
+ Index s = IsLower ? pi+actualPanelWidth : 0;
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+ r, actualPanelWidth,
+ LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
+ RhsMapper(&rhs.coeffRef(pi), rhsIncr),
+ &res.coeffRef(s), resIncr, alpha);
+ }
+ }
+ if((!IsLower) && cols>size)
+ {
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+ rows, cols-size,
+ LhsMapper(&lhs.coeffRef(0,size), lhsStride),
+ RhsMapper(&rhs.coeffRef(size), rhsIncr),
+ _res, resIncr, alpha);
+ }
+ }
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+{
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ enum {
+ IsLower = ((Mode&Lower)==Lower),
+ HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+ HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+ };
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+ {
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ Index diagSize = (std::min)(_rows,_cols);
+ Index rows = IsLower ? _rows : diagSize;
+ Index cols = IsLower ? diagSize : _cols;
+
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+ typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+ typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
+ const RhsMap rhs(_rhs,cols);
+ typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+ typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
+ ResMap res(_res,rows,InnerStride<>(resIncr));
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+ for (Index pi=0; pi<diagSize; pi+=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = pi + k;
+ Index s = IsLower ? pi : ((HasUnitDiag||HasZeroDiag) ? i+1 : i);
+ Index r = IsLower ? k+1 : actualPanelWidth-k;
+ if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+ res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum();
+ if (HasUnitDiag)
+ res.coeffRef(i) += alpha * cjRhs.coeff(i);
+ }
+ Index r = IsLower ? pi : cols - pi - actualPanelWidth;
+ if (r>0)
+ {
+ Index s = IsLower ? 0 : pi + actualPanelWidth;
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+ actualPanelWidth, r,
+ LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
+ RhsMapper(&rhs.coeffRef(s), rhsIncr),
+ &res.coeffRef(pi), resIncr, alpha);
+ }
+ }
+ if(IsLower && rows>diagSize)
+ {
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+ rows-diagSize, cols,
+ LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
+ RhsMapper(&rhs.coeffRef(0), rhsIncr),
+ &res.coeffRef(diagSize), resIncr, alpha);
+ }
+ }
+
+/***************************************************************************
+* Wrapper to triangular_matrix_vector_product
+***************************************************************************/
+
+template<int Mode,int StorageOrder>
+struct trmv_selector;
+
+} // end namespace internal
+
+namespace internal {
+
+template<int Mode, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode,true,Lhs,false,Rhs,true>
+{
+ template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
+ {
+ eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
+
+ internal::trmv_selector<Mode,(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(lhs, rhs, dst, alpha);
+ }
+};
+
+template<int Mode, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode,false,Lhs,true,Rhs,false>
+{
+ template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
+ {
+ eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
+
+ Transpose<Dest> dstT(dst);
+ internal::trmv_selector<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),
+ (int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>
+ ::run(rhs.transpose(),lhs.transpose(), dstT, alpha);
+ }
+};
+
+} // end namespace internal
+
+namespace internal {
+
+// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
+
+template<int Mode> struct trmv_selector<Mode,ColMajor>
+{
+ template<typename Lhs, typename Rhs, typename Dest>
+ static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+ {
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef typename Dest::Scalar ResScalar;
+ typedef typename Dest::RealScalar RealScalar;
+
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+
+ typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+ LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+ RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+ ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+ enum {
+ // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1;
+ // on the other hand, it is good for the cache to pack the vector anyway...
+ EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+ ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
+ MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+ };
+
+ gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+ bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+ bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+ RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+ ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+ evalToDest ? dest.data() : static_dest.data());
+
+ if(!evalToDest)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ Index size = dest.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ if(!alphaIsCompatible)
+ {
+ MappedDest(actualDestPtr, dest.size()).setZero();
+ compatibleAlpha = RhsScalar(1);
+ }
+ else
+ MappedDest(actualDestPtr, dest.size()) = dest;
+ }
+
+ internal::triangular_matrix_vector_product
+ <Index,Mode,
+ LhsScalar, LhsBlasTraits::NeedToConjugate,
+ RhsScalar, RhsBlasTraits::NeedToConjugate,
+ ColMajor>
+ ::run(actualLhs.rows(),actualLhs.cols(),
+ actualLhs.data(),actualLhs.outerStride(),
+ actualRhs.data(),actualRhs.innerStride(),
+ actualDestPtr,1,compatibleAlpha);
+
+ if (!evalToDest)
+ {
+ if(!alphaIsCompatible)
+ dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
+ else
+ dest = MappedDest(actualDestPtr, dest.size());
+ }
+
+ if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+ {
+ Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+ dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+ }
+ }
+};
+
+template<int Mode> struct trmv_selector<Mode,RowMajor>
+{
+ template<typename Lhs, typename Rhs, typename Dest>
+ static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+ {
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef typename Dest::Scalar ResScalar;
+
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+ typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+ typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+ typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+ LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+ RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+ ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+ enum {
+ DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
+ };
+
+ gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+
+ ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+ DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
+
+ if(!DirectlyUseRhs)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ Index size = actualRhs.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+ }
+
+ internal::triangular_matrix_vector_product
+ <Index,Mode,
+ LhsScalar, LhsBlasTraits::NeedToConjugate,
+ RhsScalar, RhsBlasTraits::NeedToConjugate,
+ RowMajor>
+ ::run(actualLhs.rows(),actualLhs.cols(),
+ actualLhs.data(),actualLhs.outerStride(),
+ actualRhsPtr,1,
+ dest.data(),dest.innerStride(),
+ actualAlpha);
+
+ if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+ {
+ Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+ dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+ }
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULARMATRIXVECTOR_H
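Illustrative sketch (editorial, not part of the patch): a TriangularView times a vector is handled by the trmv_selector paths above; UnitDiag and ZeroDiag modes change how the diagonal contributes, as in the kernels.

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 5);
  Eigen::VectorXd x = Eigen::VectorXd::Random(5);
  Eigen::VectorXd y = A.triangularView<Eigen::UnitUpper>() * x; // diagonal treated as ones
  Eigen::VectorXd z = A.triangularView<Eigen::Lower>() * x;     // lower triangle, actual diagonal
  return 0;
}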
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
new file mode 100644
index 000000000..3d47a2b94
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -0,0 +1,255 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ * Triangular matrix-vector product functionality based on ?TRMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements triangular matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// trmv/hemv specialization
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
+struct triangular_matrix_vector_product_trmv :
+ triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
+
+#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ } \
+}; \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ } \
+};
+
+EIGEN_BLAS_TRMV_SPECIALIZE(double)
+EIGEN_BLAS_TRMV_SPECIALIZE(float)
+EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
+
+// implements col-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper \
+ }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+ if (ConjLhs || IsZeroDiag) { \
+ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ return; \
+ }\
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
+\
+ typedef VectorX##EIGPREFIX VectorRhs; \
+ EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+ VectorRhs x_tmp; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+ char trans, uplo, diag; \
+ BlasIndex m, n, lda, incx, incy; \
+ EIGTYPE const *a; \
+ EIGTYPE beta(1); \
+\
+/* Set m, n */ \
+ n = convert_index<BlasIndex>(size); \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ incx = 1; \
+ incy = convert_index<BlasIndex>(resIncr); \
+\
+/* Set uplo, trans and diag*/ \
+ trans = 'N'; \
+ uplo = IsLower ? 'L' : 'U'; \
+ diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+\
+/* Add alpha * op(a_tr) * rhs into res */ \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case: the remaining rectangular part does not fit BLAS ?TRMV, so handle it with ?GEMV */ \
+ if (size<(std::max)(rows,cols)) { \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+ if (size<rows) { \
+ y = _res + size*resIncr; \
+ a = _lhs + size; \
+ m = convert_index<BlasIndex>(rows-size); \
+ n = convert_index<BlasIndex>(size); \
+ } \
+ else { \
+ x += size; \
+ y = _res; \
+ a = _lhs + size*lda; \
+ m = convert_index<BlasIndex>(size); \
+ n = convert_index<BlasIndex>(cols-size); \
+ } \
+ BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+ } \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_CM(double, double, d, d,)
+EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_CM(float, float, f, s,)
+EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
+EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
+#endif
+
+// implements row-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper \
+ }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+ if (IsZeroDiag) { \
+ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ return; \
+ }\
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
+\
+ typedef VectorX##EIGPREFIX VectorRhs; \
+ EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+ VectorRhs x_tmp; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+ char trans, uplo, diag; \
+ BlasIndex m, n, lda, incx, incy; \
+ EIGTYPE const *a; \
+ EIGTYPE beta(1); \
+\
+/* Set m, n */ \
+ n = convert_index<BlasIndex>(size); \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ incx = 1; \
+ incy = convert_index<BlasIndex>(resIncr); \
+\
+/* Set uplo, trans and diag*/ \
+ trans = ConjLhs ? 'C' : 'T'; \
+ uplo = IsLower ? 'U' : 'L'; \
+ diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+\
+/* Add alpha * op(a_tr) * rhs into res */ \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case: the remaining rectangular part does not fit BLAS ?TRMV, so handle it with ?GEMV */ \
+ if (size<(std::max)(rows,cols)) { \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+ if (size<rows) { \
+ y = _res + size*resIncr; \
+ a = _lhs + size*lda; \
+ m = convert_index<BlasIndex>(rows-size); \
+ n = convert_index<BlasIndex>(size); \
+ } \
+ else { \
+ x += size; \
+ y = _res; \
+ a = _lhs + size; \
+ m = convert_index<BlasIndex>(size); \
+ n = convert_index<BlasIndex>(cols-size); \
+ } \
+ BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+ } \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double, double, d, d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
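
These ?TRMV bindings (plus the ?GEMV/?AXPY calls for the non-square remainder) are only compiled in when Eigen is configured to use an external BLAS. A minimal sketch, assuming such a build; EIGEN_USE_BLAS is the documented switch, and the link line against an actual BLAS (or MKL when EIGEN_USE_MKL is defined) is platform specific:

    // Assumption: the translation unit is linked against a BLAS implementation.
    #define EIGEN_USE_BLAS
    #include <Eigen/Dense>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
      Eigen::VectorXd v = Eigen::VectorXd::Random(64);

      // For float/double/scomplex/dcomplex this product is now routed through
      // triangular_matrix_vector_product_trmv, i.e. to ?TRMV.
      Eigen::VectorXd y = A.triangularView<Eigen::Upper>() * v;
      return y.allFinite() ? 0 : 1;
    }
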
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
new file mode 100644
index 000000000..6d879ba00
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -0,0 +1,337 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// if the rhs is row major, let's transpose the product
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
+{
+ static void run(
+ Index size, Index cols,
+ const Scalar* tri, Index triStride,
+ Scalar* _other, Index otherIncr, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ triangular_solve_matrix<
+ Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
+ (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
+ NumTraits<Scalar>::IsComplex && Conjugate,
+ TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
+ ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
+ }
+};
+
+/* Optimized triangular solver with multiple right hand sides and the triangular matrix on the left
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
+{
+ static EIGEN_DONT_INLINE void run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherIncr, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherIncr, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index cols = otherSize;
+
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+ typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
+ TriMapper tri(_tri, triStride);
+ OtherMapper other(_other, otherStride, otherIncr);
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ enum {
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower
+ };
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(size,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ conj_if<Conjugate> conj;
+ gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
+
+    // the goal here is to subdivide the Rhs panels such that we keep some cache
+ // coherence when accessing the rhs elements
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
+ subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
+
+ for(Index k2=IsLower ? 0 : size;
+ IsLower ? k2<size : k2>0;
+ IsLower ? k2+=kc : k2-=kc)
+ {
+ const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc);
+
+ // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
+ // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
+      // A11 (the triangular part) and A21 (the remaining rectangular part).
+ // Then the high level algorithm is:
+ // - B = R1 => general block copy (done during the next step)
+ // - R1 = A11^-1 B => tricky part
+ // - update B from the new R1 => actually this has to be performed continuously during the above step
+ // - R2 -= A21 * B => GEPP
+
+ // The tricky part: compute R1 = A11^-1 B while updating B from R1
+ // The idea is to split A11 into multiple small vertical panels.
+ // Each panel can be split into a small triangular part T1k which is processed without optimization,
+ // and the remaining small part T2k which is processed using gebp with appropriate block strides
+ for(Index j2=0; j2<cols; j2+=subcols)
+ {
+ Index actual_cols = (std::min)(cols-j2,subcols);
+ // for each small vertical panels [T1k^T, T2k^T]^T of lhs
+ for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+ // tr solve
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ // TODO write a small kernel handling this (can be shared with trsv)
+ Index i = IsLower ? k2+k1+k : k2-k1-k-1;
+ Index rs = actualPanelWidth - k - 1; // remaining size
+ Index s = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+ : IsLower ? i+1 : i-rs;
+
+ Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
+ for (Index j=j2; j<j2+actual_cols; ++j)
+ {
+ if (TriStorageOrder==RowMajor)
+ {
+ Scalar b(0);
+ const Scalar* l = &tri(i,s);
+ typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
+ for (Index i3=0; i3<k; ++i3)
+ b += conj(l[i3]) * r(i3);
+
+ other(i,j) = (other(i,j) - b)*a;
+ }
+ else
+ {
+ Scalar& otherij = other(i,j);
+ otherij *= a;
+ Scalar b = otherij;
+ typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
+ typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
+ for (Index i3=0;i3<rs;++i3)
+ r(i3) -= b * conj(l(i3));
+ }
+ }
+ }
+
+ Index lengthTarget = actual_kc-k1-actualPanelWidth;
+ Index startBlock = IsLower ? k2+k1 : k2-k1-actualPanelWidth;
+ Index blockBOffset = IsLower ? k1 : lengthTarget;
+
+ // update the respective rows of B from other
+ pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+
+ // GEBP
+ if (lengthTarget>0)
+ {
+ Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
+
+ pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+ gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+ }
+ }
+ }
+
+ // R2 -= A21 * B => GEPP
+ {
+ Index start = IsLower ? k2+kc : 0;
+ Index end = IsLower ? size : k2-kc;
+ for(Index i2=start; i2<end; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,end-i2);
+ if (actual_mc>0)
+ {
+ pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
+
+ gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+ }
+
+/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
+{
+ static EIGEN_DONT_INLINE void run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherIncr, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherIncr, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index rows = otherSize;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+ LhsMapper lhs(_other, otherStride, otherIncr);
+ RhsMapper rhs(_tri, triStride);
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ RhsStorageOrder = TriStorageOrder,
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower
+ };
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*size;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ conj_if<Conjugate> conj;
+ gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
+
+ for(Index k2=IsLower ? size : 0;
+ IsLower ? k2>0 : k2<size;
+ IsLower ? k2-=kc : k2+=kc)
+ {
+ const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc);
+ Index actual_k2 = IsLower ? k2-actual_kc : k2 ;
+
+ Index startPanel = IsLower ? 0 : k2+actual_kc;
+ Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
+ Scalar* geb = blockB+actual_kc*actual_kc;
+
+ if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
+
+ // triangular packing (we only pack the panels off the diagonal,
+      // neglecting the blocks overlapping the diagonal)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index actual_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+
+ if (panelLength>0)
+ pack_rhs_panel(blockB+j2*actual_kc,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+ panelLength, actualPanelWidth,
+ actual_kc, panelOffset);
+ }
+ }
+
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,rows-i2);
+
+ // triangular solver kernel
+ {
+ // for each small block of the diagonal (=> vertical panels of rhs)
+ for (Index j2 = IsLower
+ ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth)
+ : Index(SmallPanelWidth)))
+ : 0;
+ IsLower ? j2>=0 : j2<actual_kc;
+ IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index absolute_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
+
+ // GEBP
+ if(panelLength>0)
+ {
+ gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
+ blockA, blockB+j2*actual_kc,
+ actual_mc, panelLength, actualPanelWidth,
+ Scalar(-1),
+ actual_kc, actual_kc, // strides
+ panelOffset, panelOffset); // offsets
+ }
+
+ // unblocked triangular solve
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
+
+ typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
+ for (Index k3=0; k3<k; ++k3)
+ {
+ Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
+ typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+ for (Index i=0; i<actual_mc; ++i)
+ r(i) -= a(i) * b;
+ }
+ if((Mode & UnitDiag)==0)
+ {
+ Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
+ for (Index i=0; i<actual_mc; ++i)
+ r(i) *= inv_rjj;
+ }
+ }
+
+ // pack the just computed part of lhs to A
+ pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
+ actualPanelWidth, actual_mc,
+ actual_kc, j2);
+ }
+ }
+
+ if (rs>0)
+ gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
+ actual_mc, actual_kc, rs, Scalar(-1),
+ -1, -1, 0, 0);
+ }
+ }
+ }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
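
Both specializations above implement the blocked, in-place triangular solve with a dense matrix of right-hand sides; they are reached through the triangularView solve API. A minimal sketch with hypothetical sizes:

    #include <Eigen/Dense>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(8, 8);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(8, 3);
      Eigen::MatrixXd C = Eigen::MatrixXd::Random(3, 8);

      // OnTheLeft kernel: solves L * X = B in place (X overwrites B),
      // reading only the lower triangle of A.
      A.triangularView<Eigen::Lower>().solveInPlace(B);

      // OnTheRight kernel: solves X * U = C in place, reading only the
      // upper triangle of A.
      A.triangularView<Eigen::Upper>().solveInPlace<Eigen::OnTheRight>(C);

      return 0;
    }
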
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
new file mode 100644
index 000000000..621194ce6
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -0,0 +1,167 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to BLAS F77
+ *   Triangular matrix solve functionality based on ?TRSM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// implements LeftSide op(triangular)^-1 * general
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+ }; \
+ static void run( \
+ Index size, Index otherSize, \
+ const EIGTYPE* _tri, Index triStride, \
+ EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+ { \
+ EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+ eigen_assert(otherIncr == 1); \
+ BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
+ char side = 'L', uplo, diag='N', transa; \
+ /* Set alpha_ */ \
+ EIGTYPE alpha(1); \
+ ldb = convert_index<BlasIndex>(otherStride);\
+\
+ const EIGTYPE *a; \
+/* Set trans */ \
+ transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+ Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+ MatrixTri a_tmp; \
+\
+ if (conjA) { \
+ a_tmp = tri.conjugate(); \
+ a = a_tmp.data(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+ } else { \
+ a = _tri; \
+ lda = convert_index<BlasIndex>(triStride); \
+ } \
+ if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double, double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float, float, strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float, float, strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
+#endif
+
+// implements RightSide general * op(triangular)^-1
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+ }; \
+ static void run( \
+ Index size, Index otherSize, \
+ const EIGTYPE* _tri, Index triStride, \
+ EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+ { \
+ EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+ eigen_assert(otherIncr == 1); \
+ BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
+ char side = 'R', uplo, diag='N', transa; \
+ /* Set alpha_ */ \
+ EIGTYPE alpha(1); \
+ ldb = convert_index<BlasIndex>(otherStride);\
+\
+ const EIGTYPE *a; \
+/* Set trans */ \
+ transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+ Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+ MatrixTri a_tmp; \
+\
+ if (conjA) { \
+ a_tmp = tri.conjugate(); \
+ a = a_tmp.data(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+ } else { \
+ a = _tri; \
+ lda = convert_index<BlasIndex>(triStride); \
+ } \
+ if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double, double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float, float, strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float, float, strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
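
For reference, the routines these macros bind to are the standard Fortran-77 ?TRSM entry points. A rough sketch of the double-precision prototype as it is conventionally seen from C++ (illustration only; Eigen ships its own declarations, and the real integer type is BlasIndex rather than int):

    // Hypothetical declaration for illustration; argument order matches the
    // BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb)
    // call in the macros above.
    extern "C" void dtrsm_(const char* side, const char* uplo, const char* transa,
                           const char* diag, const int* m, const int* n,
                           const double* alpha, const double* a, const int* lda,
                           double* b, const int* ldb);
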
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h
new file mode 100644
index 000000000..647317016
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h
@@ -0,0 +1,148 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder>
+{
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft,
+ ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+ Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor
+ >::run(size, _lhs, lhsStride, rhs);
+ }
+};
+
+// forward and backward substitution, row-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
+{
+ enum {
+ IsLower = ((Mode&Lower)==Lower)
+ };
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
+ typename internal::conditional<
+ Conjugate,
+ const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+ const LhsMap&>
+ ::type cjLhs(lhs);
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ for(Index pi=IsLower ? 0 : size;
+ IsLower ? pi<size : pi>0;
+ IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+
+ Index r = IsLower ? pi : size - pi; // remaining size
+ if (r > 0)
+ {
+ // let's directly call the low level product function because:
+ // 1 - it is faster to compile
+ // 2 - it is slightly faster at runtime
+ Index startRow = IsLower ? pi : pi-actualPanelWidth;
+ Index startCol = IsLower ? 0 : pi;
+
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+ actualPanelWidth, r,
+ LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+ RhsMapper(rhs + startCol, 1),
+ rhs + startRow, 1,
+ RhsScalar(-1));
+ }
+
+ for(Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = IsLower ? pi+k : pi-k-1;
+ Index s = IsLower ? pi : i+1;
+ if (k>0)
+ rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
+
+ if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
+ rhs[i] /= cjLhs(i,i);
+ }
+ }
+ }
+};
+
+// forward and backward substitution, column-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor>
+{
+ enum {
+ IsLower = ((Mode&Lower)==Lower)
+ };
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+ typename internal::conditional<Conjugate,
+ const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+ const LhsMap&
+ >::type cjLhs(lhs);
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+
+ for(Index pi=IsLower ? 0 : size;
+ IsLower ? pi<size : pi>0;
+ IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+ Index startBlock = IsLower ? pi : pi-actualPanelWidth;
+ Index endBlock = IsLower ? pi + actualPanelWidth : 0;
+
+ for(Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = IsLower ? pi+k : pi-k-1;
+ if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+ {
+ if(!(Mode & UnitDiag))
+ rhs[i] /= cjLhs.coeff(i,i);
+
+ Index r = actualPanelWidth - k - 1; // remaining size
+ Index s = IsLower ? i+1 : i-r;
+ if (r>0)
+ Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+ }
+ }
+ Index r = IsLower ? size - endBlock : startBlock; // remaining size
+ if (r > 0)
+ {
+ // let's directly call the low level product function because:
+ // 1 - it is faster to compile
+ // 2 - it is slightly faster at runtime
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+ r, actualPanelWidth,
+ LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+ RhsMapper(rhs+startBlock, 1),
+ rhs+endBlock, 1, RhsScalar(-1));
+ }
+ }
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H
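
The row-major and column-major kernels above perform the panel-blocked forward/backward substitution for a single right-hand side vector. A minimal sketch of the user-level calls that reach them (hypothetical sizes):

    #include <Eigen/Dense>

    int main()
    {
      Eigen::MatrixXd U = Eigen::MatrixXd::Random(16, 16);
      Eigen::VectorXd b = Eigen::VectorXd::Random(16);

      // x = U^-1 * b by backward substitution; with a column-major U this runs
      // the ColMajor triangular_solve_vector kernel.
      Eigen::VectorXd x = U.triangularView<Eigen::Upper>().solve(b);

      // In-place variant: b is overwritten with the solution.
      U.triangularView<Eigen::Upper>().solveInPlace(b);
      return 0;
    }
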