Diffstat (limited to 'src/3rdparty/eigen/Eigen/src/Core/products')
21 files changed, 8074 insertions, 0 deletions
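The diff that follows adds Eigen's GEBP kernel, including the cache-size bookkeeping (manage_caching_sizes) and the blocking heuristic documented in computeProductBlockingSizes. As a rough illustration of how those pieces are driven — assuming an ordinary Eigen installation where setCpuCacheSizes(), l1CacheSize() and internal::computeProductBlockingSizes() are available as declared in this header, and with purely illustrative cache and matrix sizes — a caller can override the detected cache sizes and inspect the kc/mc/nc blocking the heuristic would choose. This is a sketch, not part of the commit below:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Illustrative values only: pretend the target CPU has 48 KB L1, 1 MB L2, 16 MB L3.
  Eigen::setCpuCacheSizes(48 * 1024, 1024 * 1024, 16 * 1024 * 1024);
  std::cout << "L1 size used for blocking: " << Eigen::l1CacheSize() << " bytes\n";

  // Ask the heuristic which blocking sizes it would pick for a 4096^3 double product.
  Eigen::Index k = 4096, m = 4096, n = 4096;
  Eigen::internal::computeProductBlockingSizes<double, double>(k, m, n, 1);
  std::cout << "kc=" << k << " mc=" << m << " nc=" << n << "\n";

  // A plain product of this size goes through the gebp kernel defined in the diff below.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd C = A * B;
  std::cout << C(0, 0) << "\n";
  return 0;
}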
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..f35b760c1 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -0,0 +1,2645 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERAL_BLOCK_PANEL_H +#define EIGEN_GENERAL_BLOCK_PANEL_H + + +namespace Eigen { + +namespace internal { + +enum GEBPPacketSizeType { + GEBPPacketFull = 0, + GEBPPacketHalf, + GEBPPacketQuarter +}; + +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull> +class gebp_traits; + + +/** \internal \returns b if a<=0, and returns a otherwise. */ +inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) +{ + return a<=0 ? b : a; +} + +#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) + +#if EIGEN_ARCH_i386_OR_x86_64 +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024); +#elif EIGEN_ARCH_PPC +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024); +#else +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024); +#endif + +#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE + +/** \internal */ +struct CacheSizes { + CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize); + m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize); + m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize); + } + + std::ptrdiff_t m_l1; + std::ptrdiff_t m_l2; + std::ptrdiff_t m_l3; +}; + +/** \internal */ +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) +{ + static CacheSizes m_cacheSizes; + + 
if(action==SetAction) + { + // set the cpu cache size and cache all block sizes from a global cache size in byte + eigen_internal_assert(l1!=0 && l2!=0); + m_cacheSizes.m_l1 = *l1; + m_cacheSizes.m_l2 = *l2; + m_cacheSizes.m_l3 = *l3; + } + else if(action==GetAction) + { + eigen_internal_assert(l1!=0 && l2!=0); + *l1 = m_cacheSizes.m_l1; + *l2 = m_cacheSizes.m_l2; + *l3 = m_cacheSizes.m_l3; + } + else + { + eigen_internal_assert(false); + } +} + +/* Helper for computeProductBlockingSizes. + * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. The blocking sizes depends on various + * parameters: + * - the L1 and L2 cache sizes, + * - the register level blocking sizes defined by gebp_traits, + * - the number of scalars that fit into a packet (when vectorization is enabled). + * + * \sa setCpuCacheSizes */ + +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> +void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + + // Explanations: + // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and + // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed + // per mr x kc horizontal small panels where mr is the blocking size along the m dimension + // at the register level. This small horizontal panel has to stay within L1 cache. + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. + // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif + + if (num_threads > 1) { + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + kr = 8, + mr = Traits::mr, + nr = Traits::nr + }; + // Increasing k gives us more time to prefetch the content of the "C" + // registers. However once the latency is hidden there is no point in + // increasing the value of k, so we'll cap it at 320 (value determined + // experimentally). + // To avoid that k vanishes, we make k_cache at least as big as kr + const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320)); + if (k_cache < k) { + k = k_cache - (k_cache % kr); + eigen_internal_assert(k > 0); + } + + const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + const Index n_per_thread = numext::div_ceil(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. 
+ eigen_internal_assert(n_cache >= static_cast<Index>(nr)); + n = n_cache - (n_cache % nr); + eigen_internal_assert(n > 0); + } else { + n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); + } + + if (l3 > l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + const Index m_per_thread = numext::div_ceil(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) { + m = m_cache - (m_cache % mr); + eigen_internal_assert(m > 0); + } else { + m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the cache size to check the blocking strategy is not flawed +#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + l1 = 9*1024; + l2 = 32*1024; + l3 = 512*1024; +#endif + + // Early return for small problems because the computation below are time consuming for small problems. + // Perhaps it would make more sense to consider k*n*m?? + // Note that for very tiny problem, this function should be bypassed anyway + // because we use the coefficient-based implementation for them. + if((numext::maxi)(k,(numext::maxi)(m,n))<48) + return; + + typedef typename Traits::ResScalar ResScalar; + enum { + k_peeling = 8, + k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) + }; + + // ---- 1st level of blocking on L1, yields kc ---- + + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel + // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. + // We also include a register-level block of the result (mx x nr). + // (In an ideal world only the lhs panel would stay in L1) + // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: + const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); + const Index old_k = k; + if(k>max_kc) + { + // We are really blocking on the third dimension: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the result. + k = (k%max_kc)==0 ? max_kc + : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); + + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); + } + + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- + + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: + // actual_l2 = max(l2, l3/nb_core_sharing_l3) + // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) + // For instance, it corresponds to 6MB of L3 shared among 4 cores. + #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + const Index actual_l2 = l3; + #else + const Index actual_l2 = 1572864; // == 1.5 MB + #endif + + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. + // The second half is implicitly reserved to access the result and lhs coefficients. + // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful + // to limit this growth: we bound nc to growth by a factor x1.5. 
+ // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all, + // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space. + Index max_nc; + const Index lhs_bytes = m * k * sizeof(LhsScalar); + const Index remaining_l1 = l1- k_sub - lhs_bytes; + if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k) + { + // L1 blocking + max_nc = remaining_l1 / (k*sizeof(RhsScalar)); + } + else + { + // L2 blocking + max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); + } + // WARNING Below, we assume that Traits::nr is a power of two. + Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + if(n>nc) + { + // We are really blocking over the columns: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the packed lhs. + // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1" + n = (n%nc)==0 ? nc + : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1)))); + } + else if(old_k==k) + { + // So far, no blocking at all, i.e., kc==k, and nc==n. + // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete. + Index problem_size = k*n*sizeof(LhsScalar); + Index actual_lm = actual_l2; + Index max_mc = m; + if(problem_size<=1024) + { + // problem is small enough to keep in L1 + // Let's choose m such that lhs's block fit in 1/3 of L1 + actual_lm = l1; + } + else if(l3!=0 && problem_size<=32768) + { + // we have both L2 and L3, and problem is small enough to be kept in L2 + // Let's choose m such that lhs's block fit in 1/3 of L2 + actual_lm = l2; + max_mc = (numext::mini<Index>)(576,max_mc); + } + Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + if (mc > Traits::mr) mc -= mc % Traits::mr; + else if (mc==0) return; + m = (m%mc)==0 ? mc + : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); + } + } +} + +template <typename Index> +inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) +{ +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { + k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + return true; + } +#else + EIGEN_UNUSED_VARIABLE(k) + EIGEN_UNUSED_VARIABLE(m) + EIGEN_UNUSED_VARIABLE(n) +#endif + return false; +} + +/** \brief Computes the blocking parameters for a m x k times k x n matrix product + * + * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. + * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. + * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. + * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. + * + * The blocking size parameters may be evaluated: + * - either by a heuristic based on cache sizes; + * - or using fixed prescribed values (for testing purposes). 
+ * + * \sa setCpuCacheSizes */ + +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> +void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + if (!useSpecificBlockingSizes(k, m, n)) { + evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads); + } +} + +template<typename LhsScalar, typename RhsScalar, typename Index> +inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads); +} + +template <typename RhsPacket, typename RhsPacketx4, int registers_taken> +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type; +}; + +template <typename Packet> +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; + +template <int N, typename T1, typename T2, typename T3> +struct packet_conditional { typedef T3 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + ScalarPacket + +/* Vectorization logic + * real*real: unpack rhs to constant packets, ... + * + * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), + * storing each res packet into two packets (2x2), + * at the end combine them: swap the second and addsub them + * cf*cf : same but with 2x4 blocks + * cplx*real : unpack rhs to constant packets, ... 
+ * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual + */ +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits +{ +public: + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + + // register block size along the N direction must be 1 or 4 + nr = 4, + + // register block size along the M direction (currently, this one cannot be modified) + default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) + // we assume 16 registers or more + // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, + // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. + mr = Vectorizable ? 3*LhsPacketSize : default_mr, +#else + mr = default_mr, +#endif + + LhsProgress = LhsPacketSize, + RhsProgress = 1 + }; + + + typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; + typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; + typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket<RhsPacket> RhsPacketx4; + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1<ResPacket>(ResScalar(0)); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + { + dest = pset1<RhsPacketType>(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = ploadquad<RhsPacket>(b); + } + + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const + { + dest = pload<LhsPacketType>(a); + } + + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu<LhsPacketType>(a); + } + + template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const + { + 
conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj; + // It would be a lot cleaner to call pmadd all the time. Unfortunately if we + // let gcc allocate the register in which to store the result of the pmul + // (in the case where there is no FMA) gcc fails to figure out how to avoid + // spilling register. +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c = cj.pmadd(a,b,c); +#else + tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp); +#endif + } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = pmadd(c,alpha,r); + } + + template<typename ResPacketHalf> + EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const + { + r = pmadd(c,alpha,r); + } + +}; + +template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize> +{ +public: + typedef std::complex<RealScalar> LhsScalar; + typedef RealScalar RhsScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + + enum { + ConjLhs = _ConjLhs, + ConjRhs = false, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif + + LhsProgress = LhsPacketSize, + RhsProgress = 1 + }; + + typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; + typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; + typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket<RhsPacket> RhsPacketx4; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1<ResPacket>(ResScalar(0)); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + { + dest = pset1<RhsPacketType>(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const + { + // FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad<RhsPacket>(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const + { + eigen_internal_assert(RhsPacketSize<=8); + dest = pset1<RhsPacket>(*b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload<LhsPacket>(a); + } + + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu<LhsPacketType>(a); + } + + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const + { + madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); + } + + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const + { +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a.v,b,c.v); +#else + tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); +#endif + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const + { + c += a * b; + } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + + 
template <typename ResPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const + { + conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj; + r = cj.pmadd(c,alpha,r); + } + +protected: +}; + +template<typename Packet> +struct DoublePacket +{ + Packet first; + Packet second; +}; + +template<typename Packet> +DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) +{ + DoublePacket<Packet> res; + res.first = padd(a.first, b.first); + res.second = padd(a.second,b.second); + return res; +} + +// note that for DoublePacket<RealPacket> the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + +template<typename Packet> +const DoublePacket<Packet>& +predux_half_dowto4(const DoublePacket<Packet> &a, + typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0) +{ + return a; +} + +template<typename Packet> +DoublePacket<typename unpacket_traits<Packet>::half> +predux_half_dowto4(const DoublePacket<Packet> &a, + typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket<typename unpacket_traits<Packet>::half> res; + typedef std::complex<typename unpacket_traits<Packet>::type> Cplx; + typedef typename packet_traits<Cplx>::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template<typename Scalar, typename RealPacket> +void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest, + typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0) +{ + dest.first = pset1<RealPacket>(numext::real(*b)); + dest.second = pset1<RealPacket>(numext::imag(*b)); +} + +template<typename Scalar, typename RealPacket> +void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest, + typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits<Scalar>::Real RealScalar; + RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])}; + RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])}; + dest.first = ploadquad<RealPacket>(r); + dest.second = ploadquad<RealPacket>(i); +} + + +template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { + typedef DoublePacket<typename unpacket_traits<Packet>::half> half; +}; +// template<typename Packet> +// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) +// { +// DoublePacket<Packet> res; +// res.first = padd(a.first, b.first); +// res.second = padd(a.second,b.second); +// return res; +// } + +template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize > +{ +public: + typedef std::complex<RealScalar> Scalar; + typedef std::complex<RealScalar> LhsScalar; + typedef std::complex<RealScalar> RhsScalar; + typedef std::complex<RealScalar> ResScalar; + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + + enum { + ConjLhs = 
_ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = unpacket_traits<RealPacket>::vectorizable + && unpacket_traits<ScalarPacket>::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1, + + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef DoublePacket<RealPacket> DoublePacketType; + + typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing; + typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; + typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; + typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; + typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; + + // this actualy holds 8 packets! + typedef QuadPacket<RhsPacket> RhsPacketx4; + + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) + { + p.first = pset1<RealPacket>(RealScalar(0)); + p.second = pset1<RealPacket>(RealScalar(0)); + } + + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const + { + dest = pset1<ScalarPacket>(*b); + } + + // Vectorized path + template<typename RealPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const + { + dest.first = pset1<RealPacketType>(numext::real(*b)); + dest.second = pset1<RealPacketType>(numext::imag(*b)); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); + } + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const + { + loadRhs(b, dest); + } + + // Vectorized path + template<typename RealPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + { + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + { + loadQuadToDoublePacket(b,dest); + } + + // nothing special here + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); + } + + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a)); + } + + template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType> + EIGEN_STRONG_INLINE + typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const + { + c.first = padd(pmul(a,b.first), c.first); + c.second = padd(pmul(a,b.second),c.second); + } + + template<typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const 
LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const + { + c = cj.pmadd(a,b,c); + } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + + EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } + + template<typename RealPacketType, typename ResPacketType> + EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const + { + // assemble c + ResPacketType tmp; + if((!ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); + } + else if((!ConjLhs)&&(ConjRhs)) + { + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); + } + else if((ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); + } + else if((ConjLhs)&&(ConjRhs)) + { + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); + } + + r = pmadd(tmp,alpha,r); + } + +protected: + conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; +}; + +template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize > +{ +public: + typedef std::complex<RealScalar> Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + + enum { + ConjLhs = false, + ConjRhs = _ConjRhs, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; + typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; + typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1<ResPacket>(ResScalar(0)); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + { + dest = pset1<RhsPacketType>(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = ploadquad<RhsPacket>(b); + } + + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploaddup<LhsPacketType>(a); + } + + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const + { + madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); + } + + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const + { +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a,b.v,c.v); +#else + tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif + + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const + { + c += a * b; + } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + + template <typename ResPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const + { + conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj; + r = cj.pmadd(alpha,c,r); + } + +protected: + +}; + +/* optimized General packed Block * packed Panel product kernel + * + * Mixing type logic: C += A * B + * | A | B | comments + * |real |cplx | no vectorization yet, would require to pack A with duplication + * |cplx |real | easy vectorization + */ +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, 
bool ConjugateRhs> +struct gebp_kernel +{ + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15; + + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; + + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + + typedef typename DataMapper::LinearMapper LinearMapper; + + enum { + Vectorizable = Traits::Vectorizable, + LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter = QuarterTraits::LhsProgress, + RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, + ResPacketSize = Traits::ResPacketSize + }; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, +int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress> +struct last_row_process_16_packets +{ + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + EIGEN_UNUSED_VARIABLE(res); + EIGEN_UNUSED_VARIABLE(straits); + EIGEN_UNUSED_VARIABLE(blA); + EIGEN_UNUSED_VARIABLE(blB); + EIGEN_UNUSED_VARIABLE(depth); + EIGEN_UNUSED_VARIABLE(endk); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j2); + 
EIGEN_UNUSED_VARIABLE(alpha); + EIGEN_UNUSED_VARIABLE(C0); + } +}; + + +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> { + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter; + typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter; + typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter; + typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter; + + SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2); + SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha); + + if (depth - endk > 0) + { + // We have to handle the last row(s) of the rhs, which + // correspond to a half-packet + SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0)); + + for (Index kk = endk; kk < depth; kk++) + { + SLhsPacketQuarter a0; + SRhsPacketQuarter b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + straits.madd(a0,b0,c0,b0, fix<0>); + blB += SwappedTraits::LhsProgress/4; + blA += 1; + } + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R); + } + res.scatterPacket(i, j2, R); + } +}; + +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_one_packet +{ + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index 
strideA, Index strideB, Index offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i<peelEnd; i+=LhsProgress) + { + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + // We select a LhsProgress x nr micro block of res + // which is entirely stored into 1 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + traits.initAcc(C2); + traits.initAcc(C3); + // To improve instruction pipelining, let's double the accumulation registers: + // even k will accumulate in C*, while odd k will accumulate in D*. + // This trick is crutial to get good performance with FMA, otherwise it is + // actually faster to perform separated MUL+ADD because of a naturally + // better instruction-level parallelism. + AccPacket D0, D1, D2, D3; + traits.initAcc(D0); + traits.initAcc(D1); + traits.initAcc(D2); + traits.initAcc(D3); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4"); + RhsPacketx4 rhs_panel; + RhsPacket T0; + + internal::prefetch(blB+(48+0)); + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + internal::prefetch(blB+(48+16)); + peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + + blB += pk*4*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4"); + } + C0 = padd(C0,D0); + C1 = padd(C1,D1); + C2 = padd(C2,D2); + C3 = padd(C3,D3); + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacketx4 rhs_panel; + RhsPacket T0; + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + blB += 4*RhsProgress; + blA += LhsProgress; + } + + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0); + R1 = r1.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket<ResPacket>(0); + R1 = r3.template loadPacket<ResPacket>(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } 
+ + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0; + traits.initAcc(C0); + + LinearMapper r0 = res.getLinearMapper(i, j2); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0; + + for(Index k= 0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1"); + RhsPacket B_0; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + /* FIXME: why unaligned???? */ \ + traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0; + ResPacket alphav = pset1<ResPacket>(alpha); + R0 = r0.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs> + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + Traits traits; + SwappedTraits straits; + + if(strideA==-1) strideA = depth; + if(strideB==-1) strideB = depth; + 
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; + const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; + enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) + const Index peeled_kc = depth & ~(pk-1); + const int prefetch_res_offset = 32/sizeof(ResScalar); +// const Index depth2 = depth & ~1; + + //---------- Process 3 * LhsProgress rows at once ---------- + // This corresponds to 3*LhsProgress x nr register blocks. + // Usually, make sense only with FMA + if(mr>=3*Traits::LhsProgress) + { + // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth) + // and on each largest micro vertical panel of the rhs (depth * nr). + // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1. + // However, if depth is too small, we can extend the number of rows of these horizontal panels. + // This actual number of rows is computed as follow: + const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. + // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size + // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess), + // or because we are testing specific blocking sizes. + const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); + for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows) + { + const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3); + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress) + { + + // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 3 x nr registers. 
+ + const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7, + C8, C9, C10, C11; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); + // 15 registers are taken (12 for acc, 2 for lhs). + RhsPanel15 rhs_panel; + RhsPacket T0; + LhsPacket A2; + #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0)) + // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633 + // without this workaround A0, A1, and A2 are loaded in the same register, + // which is not good for pipelining + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2)); + #else + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND + #endif +#define EIGEN_GEBP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + internal::prefetch(blA + (3 * K + 16) * LhsProgress); \ + if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \ + internal::prefetch(blB + (4 * K + 16) * RhsProgress); \ + } /* Bug 953 */ \ + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \ + traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) + + internal::prefetch(blB); + EIGEN_GEBP_ONESTEP(0); + EIGEN_GEBP_ONESTEP(1); + EIGEN_GEBP_ONESTEP(2); + EIGEN_GEBP_ONESTEP(3); + EIGEN_GEBP_ONESTEP(4); + EIGEN_GEBP_ONESTEP(5); + EIGEN_GEBP_ONESTEP(6); + EIGEN_GEBP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4"); + } + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPanel15 rhs_panel; + RhsPacket T0; + LhsPacket A2; + EIGEN_GEBP_ONESTEP(0); + blB += 
4*RhsProgress; + blA += 3*Traits::LhsProgress; + } + +#undef EIGEN_GEBP_ONESTEP + + ResPacket R0, R1, R2; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); + traits.acc(C1, alphav, R0); + traits.acc(C5, alphav, R1); + traits.acc(C9, alphav, R2); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C10, alphav, R2); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); + traits.acc(C3, alphav, R0); + traits.acc(C7, alphav, R1); + traits.acc(C11, alphav, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); + } + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C4, C8; + traits.initAcc(C0); + traits.initAcc(C4); + traits.initAcc(C8); + + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(0); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0, A1, A2; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); + RhsPacket B_0; +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += int(pk) * int(RhsProgress); + blA += int(pk) * 3 * int(Traits::LhsProgress); + + 
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 3*Traits::LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0, R1, R2; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + } + } + } + } + + //---------- Process 2 * LhsProgress rows at once ---------- + if(mr>=2*Traits::LhsProgress) + { + const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. + // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size + // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess), + // or because we are testing specific blocking sizes. + Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); + + for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows) + { + Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2); + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress) + { + + // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 2 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); + RhsPacketx4 rhs_panel; + RhsPacket T0; + + // NOTE: the begin/end asm comments below work around bug 935! 
+ // but they are not enough for gcc>=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \ + traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) + + internal::prefetch(blB+(48+0)); + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + internal::prefetch(blB+(48+16)); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*(2*Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4"); + } + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacketx4 rhs_panel; + RhsPacket T0; + EIGEN_GEBGP_ONESTEP(0); + blB += 4*RhsProgress; + blA += 2*Traits::LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + + ResPacket R0, R1, R2, R3; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C1, alphav, R2); + traits.acc(C5, alphav, R3); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C3, alphav, R2); + traits.acc(C7, alphav, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); + } + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C4; + traits.initAcc(C0); + traits.initAcc(C4); + + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0, A1; + + 
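In scalar terms, each step of the 2pX1 micro kernel below multiplies one rhs coefficient, broadcast across a packet, by two consecutive lhs packets and accumulates the results. A minimal model of what the eight unrolled EIGEN_GEBGP_ONESTEP invocations compute, assuming RhsProgress == 1 and writing P for LhsProgress (the packet width):

    // Illustrative sketch only; C0 and C4 stand for the two accumulator packets.
    void gebp_2px1_ref(const float* blA, const float* blB, int depth, int P,
                       float* C0, float* C4)
    {
      for (int k = 0; k < depth; ++k) {
        float b = blB[k];                         // one rhs value, conceptually broadcast
        for (int p = 0; p < P; ++p) {
          C0[p] += blA[(2*k + 0)*P + p] * b;      // first lhs packet of the row strip
          C4[p] += blA[(2*k + 1)*P + p] * b;      // second lhs packet
        }
      }
    }

The scaling by alpha only happens afterwards, when the accumulators are folded into the result block through traits.acc.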
for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1"); + RhsPacket B_0, B1; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B1, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ + } while(false) + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += int(pk) * int(RhsProgress); + blA += int(pk) * 2 * int(Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0, B1; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 2*Traits::LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + } + } + } + } + //---------- Process 1 * LhsProgress rows at once ---------- + if(mr>=1*Traits::LhsProgress) + { + lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process remaining rows, 1 at once ---------- + if(peeled_mc_quarter<rows) + { + // loop on each panel of the rhs + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + // loop on each row of the lhs (1*LhsProgress x depth) + for(Index i=peeled_mc_quarter; i<rows; i+=1) + { + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + + // If LhsProgress is 8 or 16, it assumes that there is a + // half or quarter packet, respectively, of the same size 
as + // nr (which is currently 4) for the return type. + const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size; + const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size; + if ((SwappedTraits::LhsProgress % 4) == 0 && + (SwappedTraits::LhsProgress<=16) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) + { + SAccPacket C0, C1, C2, C3; + straits.initAcc(C0); + straits.initAcc(C1); + straits.initAcc(C2); + straits.initAcc(C3); + + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); + const Index endk = (depth/spk)*spk; + const Index endk4 = (depth/(spk*4))*(spk*4); + + Index k=0; + for(; k<endk4; k+=4*spk) + { + SLhsPacket A0,A1; + SRhsPacket B_0,B_1; + + straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0); + straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1); + + straits.loadRhsQuad(blA+0*spk, B_0); + straits.loadRhsQuad(blA+1*spk, B_1); + straits.madd(A0,B_0,C0,B_0, fix<0>); + straits.madd(A1,B_1,C1,B_1, fix<0>); + + straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); + straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); + straits.loadRhsQuad(blA+2*spk, B_0); + straits.loadRhsQuad(blA+3*spk, B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); + + blB += 4*SwappedTraits::LhsProgress; + blA += 4*spk; + } + C0 = padd(padd(C0,C1),padd(C2,C3)); + for(; k<endk; k+=spk) + { + SLhsPacket A0; + SRhsPacket B_0; + + straits.loadLhsUnaligned(blB, A0); + straits.loadRhsQuad(blA, B_0); + straits.madd(A0,B_0,C0,B_0, fix<0>); + + blB += SwappedTraits::LhsProgress; + blA += spk; + } + if(SwappedTraits::LhsProgress==8) + { + // Special case where we have to first reduce the accumulation register C0 + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf; + + SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2); + SResPacketHalf alphav = pset1<SResPacketHalf>(alpha); + + if(depth-endk>0) + { + // We have to handle the last row of the rhs which corresponds to a half-packet + SLhsPacketHalf a0; + SRhsPacketHalf b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + SAccPacketHalf c0 = predux_half_dowto4(C0); + straits.madd(a0,b0,c0,b0, fix<0>); + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_half_dowto4(C0), alphav, R); + } + res.scatterPacket(i, j2, R); + } + else if (SwappedTraits::LhsProgress==16) + { + // Special case where we have to first reduce the + // accumulation register C0. 
We specialize the block in + // template form, so that LhsProgress < 16 paths don't + // fail to compile + last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p; + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + } + else + { + SResPacket R = res.template gatherPacket<SResPacket>(i, j2); + SResPacket alphav = pset1<SResPacket>(alpha); + straits.acc(C0, alphav, R); + res.scatterPacket(i, j2, R); + } + } + else // scalar path + { + // get a 1 x 4 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); + + for(Index k=0; k<depth; k++) + { + LhsScalar A0; + RhsScalar B_0, B_1; + + A0 = blA[k]; + + B_0 = blB[0]; + B_1 = blB[1]; + C0 = cj.pmadd(A0,B_0,C0); + C1 = cj.pmadd(A0,B_1,C1); + + B_0 = blB[2]; + B_1 = blB[3]; + C2 = cj.pmadd(A0,B_0,C2); + C3 = cj.pmadd(A0,B_1,C3); + + blB += 4; + } + res(i, j2 + 0) += alpha * C0; + res(i, j2 + 1) += alpha * C1; + res(i, j2 + 2) += alpha * C2; + res(i, j2 + 3) += alpha * C3; + } + } + } + // remaining columns + for(Index j2=packet_cols4; j2<cols; j2++) + { + // loop on each row of the lhs (1*LhsProgress x depth) + for(Index i=peeled_mc_quarter; i<rows; i+=1) + { + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + // gets a 1 x 1 res block as registers + ResScalar C0(0); + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + for(Index k=0; k<depth; k++) + { + LhsScalar A0 = blA[k]; + RhsScalar B_0 = blB[k]; + C0 = cj.pmadd(A0, B_0, C0); + } + res(i, j2) += alpha * C0; + } + } + } + } + + +// pack a block of the lhs +// The traversal is as follow (mr==4): +// 0 4 8 12 ... +// 1 5 9 13 ... +// 2 6 10 14 ... +// 3 7 11 15 ... +// +// 16 20 24 28 ... +// 17 21 25 29 ... +// 18 22 26 30 ... +// 19 23 27 31 ... +// +// 32 33 34 35 ... +// 36 36 38 39 ... +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> +{ + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index count = 0; + + const Index peeled_mc3 = Pack1>=3*PacketSize ? 
(rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; + + Index i=0; + + // Pack 3 packets + if(Pack1>=3*PacketSize) + { + for(; i<peeled_mc3; i+=3*PacketSize) + { + if(PanelMode) count += (3*PacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + Packet A, B, C; + A = lhs.template loadPacket<Packet>(i+0*PacketSize, k); + B = lhs.template loadPacket<Packet>(i+1*PacketSize, k); + C = lhs.template loadPacket<Packet>(i+2*PacketSize, k); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + pstore(blockA+count, cj.pconj(C)); count+=PacketSize; + } + if(PanelMode) count += (3*PacketSize) * (stride-offset-depth); + } + } + // Pack 2 packets + if(Pack1>=2*PacketSize) + { + for(; i<peeled_mc2; i+=2*PacketSize) + { + if(PanelMode) count += (2*PacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + Packet A, B; + A = lhs.template loadPacket<Packet>(i+0*PacketSize, k); + B = lhs.template loadPacket<Packet>(i+1*PacketSize, k); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + } + if(PanelMode) count += (2*PacketSize) * (stride-offset-depth); + } + } + // Pack 1 packets + if(Pack1>=1*PacketSize) + { + for(; i<peeled_mc1; i+=1*PacketSize) + { + if(PanelMode) count += (1*PacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + Packet A; + A = lhs.template loadPacket<Packet>(i+0*PacketSize, k); + pstore(blockA+count, cj.pconj(A)); + count+=PacketSize; + } + if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); + } + } + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) + { + for(; i<peeled_mc_half; i+=HalfPacketSize) + { + if(PanelMode) count += (HalfPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + HalfPacket A; + A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); + } + } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i<peeled_mc_quarter; i+=QuarterPacketSize) + { + if(PanelMode) count += (QuarterPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + QuarterPacket A; + A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). 
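For instance (illustrative numbers only): with 4-wide real packets, a packet of complex rhs coefficients interleaves real and imaginary parts as [r0, i0, r1, i1], so a single packet multiply can only pair it with two lhs rows, each duplicated as [a0, a0, a1, a1]. The lhs therefore advances by half its usual progress, Pack2 records that reduced step, and the loop below packs those rows one scalar at a time, last_lhs_progress rows per iteration, so that the tail matches what the rhs-side peeling expects.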
+ if(Pack2<PacketSize && Pack2>1) + { + for(; i<peeled_mc0; i+=last_lhs_progress) + { + if(PanelMode) count += last_lhs_progress * offset; + + for(Index k=0; k<depth; k++) + for(Index w=0; w<last_lhs_progress; w++) + blockA[count++] = cj(lhs(i+w, k)); + + if(PanelMode) count += last_lhs_progress * (stride-offset-depth); + } + } + // Pack scalars + for(; i<rows; i++) + { + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); + if(PanelMode) count += (stride-offset-depth); + } +} + +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> +{ + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; + + Index i = 0; + int pack = Pack1; + int psize = PacketSize; + while(pack>0) + { + Index remaining_rows = rows-i; + Index peeled_mc = gone_last ? Pack2>1 ? 
(rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; + for(; i<peeled_mc; i+=pack) + { + if(PanelMode) count += pack * offset; + + Index k=0; + if(pack>=psize && psize >= QuarterPacketSize) + { + const Index peeled_k = (depth/psize)*psize; + for(; k<peeled_k; k+=psize) + { + for (Index m = 0; m < pack; m += psize) + { + if (psize == PacketSize) { + PacketBlock<Packet> kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock<HalfPacket> kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock<QuarterPacket> kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } + } + count += psize*pack; + } + } + + for(; k<depth; k++) + { + Index w=0; + for(; w<pack-3; w+=4) + { + Scalar a(cj(lhs(i+w+0, k))), + b(cj(lhs(i+w+1, k))), + c(cj(lhs(i+w+2, k))), + d(cj(lhs(i+w+3, k))); + blockA[count++] = a; + blockA[count++] = b; + blockA[count++] = c; + blockA[count++] = d; + } + if(pack%4) + for(;w<pack;++w) + blockA[count++] = cj(lhs(i+w, k)); + } + + if(PanelMode) count += pack * (stride-offset-depth); + } + + pack -= psize; + Index left = rows - i; + if (pack <= 0) { + if (!gone_last && + (starting_pos == i || left >= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). + if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } + } + + for(; i<rows; i++) + { + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); + if(PanelMode) count += (stride-offset-depth); + } +} + +// copy a complete panel of the rhs +// this version is optimized for column major matrices +// The traversal order is as follow: (nr==4): +// 0 1 2 3 12 13 14 15 24 27 +// 4 5 6 7 16 17 18 19 25 28 +// 8 9 10 11 20 21 22 23 26 29 +// . . . . . . . . . . 
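A simplified scalar model of this nr == 4 traversal, ignoring vectorization, conjugation, PanelMode and the packet-transpose fast path (illustrative only; the column-major rhs coefficient (k, j) is assumed to live at rhs[k + j*rhsStride]):

    void pack_rhs_ref(float* blockB, const float* rhs, int rhsStride,
                      int depth, int cols)
    {
      int count = 0;
      int packet_cols4 = (cols / 4) * 4;
      // Groups of 4 columns: for each k, the 4 coefficients of row k end up side by side.
      for (int j = 0; j < packet_cols4; j += 4)
        for (int k = 0; k < depth; ++k)
          for (int w = 0; w < 4; ++w)
            blockB[count++] = rhs[k + (j + w) * rhsStride];
      // Leftover columns (cols % 4) are copied one column at a time.
      for (int j = packet_cols4; j < cols; ++j)
        for (int k = 0; k < depth; ++k)
          blockB[count++] = rhs[k + j * rhsStride];
    }

With depth = 3 and cols = 10 this reproduces exactly the numbering pictured above: 0..11 for the first group of 4 columns, 12..23 for the second, then 24..29 for the two leftover columns.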
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> +{ + typedef typename packet_traits<Scalar>::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; + enum { PacketSize = packet_traits<Scalar>::size }; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + Index count = 0; + const Index peeled_k = (depth/PacketSize)*PacketSize; +// if(nr>=8) +// { +// for(Index j2=0; j2<packet_cols8; j2+=8) +// { +// // skip what we have before +// if(PanelMode) count += 8 * offset; +// const Scalar* b0 = &rhs[(j2+0)*rhsStride]; +// const Scalar* b1 = &rhs[(j2+1)*rhsStride]; +// const Scalar* b2 = &rhs[(j2+2)*rhsStride]; +// const Scalar* b3 = &rhs[(j2+3)*rhsStride]; +// const Scalar* b4 = &rhs[(j2+4)*rhsStride]; +// const Scalar* b5 = &rhs[(j2+5)*rhsStride]; +// const Scalar* b6 = &rhs[(j2+6)*rhsStride]; +// const Scalar* b7 = &rhs[(j2+7)*rhsStride]; +// Index k=0; +// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4 +// { +// for(; k<peeled_k; k+=PacketSize) { +// PacketBlock<Packet> kernel; +// for (int p = 0; p < PacketSize; ++p) { +// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]); +// } +// ptranspose(kernel); +// for (int p = 0; p < PacketSize; ++p) { +// pstoreu(blockB+count, cj.pconj(kernel.packet[p])); +// count+=PacketSize; +// } +// } +// } +// for(; k<depth; k++) +// { +// blockB[count+0] = cj(b0[k]); +// blockB[count+1] = cj(b1[k]); +// blockB[count+2] = cj(b2[k]); +// blockB[count+3] = cj(b3[k]); +// blockB[count+4] = cj(b4[k]); +// blockB[count+5] = cj(b5[k]); +// blockB[count+6] = cj(b6[k]); +// blockB[count+7] = cj(b7[k]); +// count += 8; +// } +// // skip what we have after +// if(PanelMode) count += 8 * (stride-offset-depth); +// } +// } + + if(nr>=4) + { + for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) + { + // skip what we have before + if(PanelMode) count += 4 * offset; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k=0; + if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ?? 
+ { + for(; k<peeled_k; k+=PacketSize) { + PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; + kernel.packet[0 ] = dm0.template loadPacket<Packet>(k); + kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k); + kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k); + kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k); + ptranspose(kernel); + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize])); + count+=4*PacketSize; + } + } + for(; k<depth; k++) + { + blockB[count+0] = cj(dm0(k)); + blockB[count+1] = cj(dm1(k)); + blockB[count+2] = cj(dm2(k)); + blockB[count+3] = cj(dm3(k)); + count += 4; + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); + } + } + + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols4; j2<cols; ++j2) + { + if(PanelMode) count += offset; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2); + for(Index k=0; k<depth; k++) + { + blockB[count] = cj(dm0(k)); + count += 1; + } + if(PanelMode) count += (stride-offset-depth); + } +} + +// this version is optimized for row major matrices +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> +{ + typedef typename packet_traits<Scalar>::type Packet; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + typedef typename DataMapper::LinearMapper LinearMapper; + enum { PacketSize = packet_traits<Scalar>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size}; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) + { + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + const bool HasHalf = (int)HalfPacketSize < (int)PacketSize; + const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize; + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; + Index count = 0; + + // if(nr>=8) + // { + // for(Index j2=0; j2<packet_cols8; j2+=8) + // { + // // skip what we have before + // if(PanelMode) count += 8 * offset; + // for(Index k=0; k<depth; k++) + // { + // if (PacketSize==8) { + // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); + // pstoreu(blockB+count, cj.pconj(A)); + // } else if (PacketSize==4) { + // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); + // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]); + // pstoreu(blockB+count, cj.pconj(A)); + // pstoreu(blockB+count+PacketSize, cj.pconj(B)); + // } else { + // const Scalar* b0 = &rhs[k*rhsStride + j2]; + // blockB[count+0] = cj(b0[0]); + // blockB[count+1] = cj(b0[1]); + // blockB[count+2] = cj(b0[2]); + // blockB[count+3] = cj(b0[3]); + // blockB[count+4] = cj(b0[4]); + // blockB[count+5] = cj(b0[5]); + // blockB[count+6] = cj(b0[6]); + // blockB[count+7] = cj(b0[7]); + // } + // count += 8; + // } + // // skip what we have after + // if(PanelMode) count += 8 * (stride-offset-depth); + // } + // } + if(nr>=4) + { + for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) + { + // skip what we have before + if(PanelMode) count += 4 * offset; + for(Index k=0; k<depth; k++) + { + if (PacketSize==4) { + Packet A = rhs.template loadPacket<Packet>(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else if (HasHalf && HalfPacketSize==4) { + HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += HalfPacketSize; + } else if (HasQuarter && QuarterPacketSize==4) { + QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += QuarterPacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); + count += 4; + } + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); + } + } + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols4; j2<cols; ++j2) + { + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + { + blockB[count] = cj(rhs(k, j2)); + count += 1; + } + if(PanelMode) count += stride-offset-depth; + } + } +}; + +} // end namespace internal + +/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSize */ +inline std::ptrdiff_t l1CacheSize() +{ + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); + return l1; +} + +/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSize */ +inline std::ptrdiff_t l2CacheSize() +{ + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); + return l2; +} + +/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\ +rs. +* \sa setCpuCacheSize */ +inline std::ptrdiff_t l3CacheSize() +{ + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); + return l3; +} + +/** Set the cpu L1 and L2 cache sizes (in bytes). + * These values are use to adjust the size of the blocks + * for the algorithms working per blocks. 
+ * + * \sa computeProductBlockingSizes */ +inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) +{ + internal::manage_caching_sizes(SetAction, &l1, &l2, &l3); +} + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_BLOCK_PANEL_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h new file mode 100644 index 000000000..caa65fccc --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -0,0 +1,517 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H +#define EIGEN_GENERAL_MATRIX_MATRIX_H + +namespace Eigen { + +namespace internal { + +template<typename _LhsScalar, typename _RhsScalar> class level3_blocking; + +/* Specialization for a row-major destination matrix => simple transposition of the product */ +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride> +{ + typedef gebp_traits<RhsScalar,LhsScalar> Traits; + + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, Index depth, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, + ResScalar* res, Index resIncr, Index resStride, + ResScalar alpha, + level3_blocking<RhsScalar,LhsScalar>& blocking, + GemmParallelInfo<Index>* info = 0) + { + // transpose the product such that the result is column major + general_matrix_matrix_product<Index, + RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, + LhsScalar, LhsStorageOrder==RowMajor ? 
ColMajor : RowMajor, ConjugateLhs, + ColMajor,ResInnerStride> + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info); + } +}; + +/* Specialization for a col-major destination matrix + * => Blocking algorithm following Goto's paper */ +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride> +{ + +typedef gebp_traits<LhsScalar,RhsScalar> Traits; + +typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; +static void run(Index rows, Index cols, Index depth, + const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resIncr, Index resStride, + ResScalar alpha, + level3_blocking<LhsScalar,RhsScalar>& blocking, + GemmParallelInfo<Index>* info = 0) +{ + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper; + LhsMapper lhs(_lhs, lhsStride); + RhsMapper rhs(_rhs, rhsStride); + ResMapper res(_res, resStride, resIncr); + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction + + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + +#ifdef EIGEN_HAS_OPENMP + if(info) + { + // this is the parallel version! + int tid = omp_get_thread_num(); + int threads = omp_get_num_threads(); + + LhsScalar* blockA = blocking.blockA(); + eigen_internal_assert(blockA!=0); + + std::size_t sizeB = kc*nc; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0); + + // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... + for(Index k=0; k<depth; k+=kc) + { + const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A' + + // In order to reduce the chance that a thread has to wait for the other, + // let's start by packing B'. + pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc); + + // Pack A_k to A' in a parallel fashion: + // each thread packs the sub block A_k,i to A'_i where i is the thread id. + + // However, before copying to A'_i, we have to make sure that no other thread is still using it, + // i.e., we test that info[tid].users equals 0. + // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. + while(info[tid].users!=0) {} + info[tid].users = threads; + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); + + // Notify the other threads that the part A'_i is ready to go. 
+ info[tid].sync = k; + + // Computes C_i += A' * B' per A'_i + for(int shift=0; shift<threads; ++shift) + { + int i = (tid+shift)%threads; + + // At this point we have to make sure that A'_i has been updated by the thread i, + // we use testAndSetOrdered to mimic a volatile access. + // However, no need to wait for the B' part which has been updated by the current thread! + if (shift>0) { + while(info[i].sync!=k) { + } + } + + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); + } + + // Then keep going as usual with the remaining B' + for(Index j=nc; j<cols; j+=nc) + { + const Index actual_nc = (std::min)(j+nc,cols)-j; + + // pack B_k,j to B' + pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc); + + // C_j += A' * B' + gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha); + } + + // Release all the sub blocks A'_i of A' for the current thread, + // i.e., we simply decrement the number of users by 1 + for(Index i=0; i<threads; ++i) +#if !EIGEN_HAS_CXX11_ATOMIC + #pragma omp atomic +#endif + info[i].users -= 1; + } + } + else +#endif // EIGEN_HAS_OPENMP + { + EIGEN_UNUSED_VARIABLE(info); + + // this is the sequential version! + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*nc; + + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); + + const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols; + + // For each horizontal panel of the rhs, and corresponding panel of the lhs... + for(Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,rows)-i2; + + for(Index k2=0; k2<depth; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,depth)-k2; + + // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. + // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); + + // For each kc x nc block of the rhs's horizontal panel... + for(Index j2=0; j2<cols; j2+=nc) + { + const Index actual_nc = (std::min)(j2+nc,cols)-j2; + + // We pack the rhs's block into a sequential chunk of memory (L2 caching) + // Note that this block will be read a very high number of times, which is equal to the number of + // micro horizontal panel of the large rhs's panel (e.g., rows/12 times). 
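To put a number on that: with mr = 12 (e.g. three packets of four doubles with AVX) and rows = 2400, the same packed kc x nc block of the rhs is traversed once per 12-row strip of the lhs, i.e. on the order of 2400/12 = 200 times over the whole row loop, which is what makes the extra copy into a contiguous, cache-friendly buffer pay off.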
+ if((!pack_rhs_once) || i2==0) + pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); + + // Everything is packed, we can now call the panel * block kernel: + gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha); + } + } + } + } +} + +}; + +/********************************************************************************* +* Specialization of generic_product_impl for "large" GEMM, i.e., +* implementation of the high level wrapper to general_matrix_matrix_product +**********************************************************************************/ + +template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType> +struct gemm_functor +{ + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking) + : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) + {} + + void initParallelSession(Index num_threads) const + { + m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads); + m_blocking.allocateA(); + } + + void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const + { + if(cols==-1) + cols = m_rhs.cols(); + + Gemm::run(rows, cols, m_lhs.cols(), + &m_lhs.coeffRef(row,0), m_lhs.outerStride(), + &m_rhs.coeffRef(0,col), m_rhs.outerStride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(), + m_actualAlpha, m_blocking, info); + } + + typedef typename Gemm::Traits Traits; + + protected: + const Lhs& m_lhs; + const Rhs& m_rhs; + Dest& m_dest; + Scalar m_actualAlpha; + BlockingType& m_blocking; +}; + +template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1, +bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space; + +template<typename _LhsScalar, typename _RhsScalar> +class level3_blocking +{ + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + + protected: + LhsScalar* m_blockA; + RhsScalar* m_blockB; + + Index m_mc; + Index m_nc; + Index m_kc; + + public: + + level3_blocking() + : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) + {} + + inline Index mc() const { return m_mc; } + inline Index nc() const { return m_nc; } + inline Index kc() const { return m_kc; } + + inline LhsScalar* blockA() { return m_blockA; } + inline RhsScalar* blockB() { return m_blockB; } +}; + +template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor> +class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */> + : public level3_blocking< + typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, + typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> +{ + enum { + Transpose = StorageOrder==RowMajor, + ActualRows = Transpose ? MaxCols : MaxRows, + ActualCols = Transpose ? 
MaxRows : MaxCols + }; + typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar; + typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar; + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + enum { + SizeA = ActualRows * MaxDepth, + SizeB = ActualCols * MaxDepth + }; + +#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES + EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA]; + EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB]; +#else + EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1]; + EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1]; +#endif + + public: + + gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/) + { + this->m_mc = ActualRows; + this->m_nc = ActualCols; + this->m_kc = MaxDepth; +#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES + this->m_blockA = m_staticA; + this->m_blockB = m_staticB; +#else + this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)); + this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)); +#endif + } + + void initParallel(Index, Index, Index, Index) + {} + + inline void allocateA() {} + inline void allocateB() {} + inline void allocateAll() {} +}; + +template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor> +class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false> + : public level3_blocking< + typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, + typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> +{ + enum { + Transpose = StorageOrder==RowMajor + }; + typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar; + typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar; + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + + Index m_sizeA; + Index m_sizeB; + + public: + + gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? rows : cols; + this->m_kc = depth; + + if(l3_blocking) + { + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads); + } + else // no l3 blocking + { + Index n = this->m_nc; + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads); + } + + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + + void initParallel(Index rows, Index cols, Index depth, Index num_threads) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? 
rows : cols; + this->m_kc = depth; + + eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0); + Index m = this->m_mc; + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads); + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + + void allocateA() + { + if(this->m_blockA==0) + this->m_blockA = aligned_new<LhsScalar>(m_sizeA); + } + + void allocateB() + { + if(this->m_blockB==0) + this->m_blockB = aligned_new<RhsScalar>(m_sizeB); + } + + void allocateAll() + { + allocateA(); + allocateB(); + } + + ~gemm_blocking_space() + { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; + +} // end namespace internal + +namespace internal { + +template<typename Lhs, typename Rhs> +struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> + : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> > +{ + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned; + + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; + + enum { + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + }; + + typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct; + + template<typename Dst> + static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program + // to determine the following heuristic. + // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, + // unless it has been specialized by the user or for a given architecture. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. + // I'm not sure it is still required. 
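As a concrete illustration with the default threshold of 20: for a 4x5 lhs times a 5x6 rhs, rhs.rows() + dst.rows() + dst.cols() = 5 + 4 + 6 = 15 < 20, so the product is evaluated with the coefficient-based (lazy) path below, while anything larger falls through to the blocked GEMM.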
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>()); + else + { + dst.setZero(); + scaleAndAddTo(dst, lhs, rhs, Scalar(1)); + } + } + + template<typename Dst> + static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>()); + else + scaleAndAddTo(dst,lhs, rhs, Scalar(1)); + } + + template<typename Dst> + static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>()); + else + scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); + } + + template<typename Dest> + static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); + if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) + return; + + if (dst.cols() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct> + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct> + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); + + Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, + Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, + LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime>, + ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> + (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h new file mode 100644 index 000000000..6ba0d9bdb --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -0,0 +1,317 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H + +namespace Eigen { + +template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs> +struct selfadjoint_rank1_update; + +namespace internal { + +/********************************************************************** +* This file implements a general A * B product while +* evaluating only one triangular part of the product. +* This is a more general version of self adjoint product (C += A A^T) +* as the level 3 SYRK Blas routine. +**********************************************************************/ + +// forward declarations (defined at the end of this file) +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo> +struct tribb_kernel; + +/* Optimized matrix-matrix product evaluating only one triangular half */ +template <typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized> +struct general_matrix_matrix_triangular_product; + +// as usual if the result is row major => we transpose the product +template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version> +{ + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride, + const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking) + { + general_matrix_matrix_triangular_product<Index, + RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, + LhsScalar, LhsStorageOrder==RowMajor ? 
ColMajor : RowMajor, ConjugateLhs, + ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower> + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking); + } +}; + +template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version> +{ + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resIncr, Index resStride, + const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) + { + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride, resIncr); + + Index kc = blocking.kc(); + Index mc = (std::min)(size,blocking.mc()); + + // !!! mc must be a multiple of nr: + if(mc > Traits::nr) + mc = (mc/Traits::nr)*Traits::nr; + + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*size; + + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); + + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb; + + for(Index k2=0; k2<depth; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,depth)-k2; + + // note that the actual rhs is the transpose/adjoint of mat + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size); + + for(Index i2=0; i2<size; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,size)-i2; + + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); + + // the selected actual_mc * size panel of res is split into three different part: + // 1 - before the diagonal => processed with gebp or skipped + // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel + // 3 - after the diagonal => processed with gebp or skipped + if (UpLo==Lower) + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + + sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + + if (UpLo==Upper) + { + Index j2 = i2+actual_mc; + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); + } + } + } + } +}; + +// Optimized packed Block * packed Block product kernel evaluating only one given triangular part +// This kernel is built on top of the gebp kernel: +// - the current 
destination block is processed per panel of actual_mc x BlockSize +// where BlockSize is set to the minimal value allowing gebp to be as fast as possible +// - then, as usual, each panel is split into three parts along the diagonal, +// the sub blocks above and below the diagonal are processed as usual, +// while the triangular block overlapping the diagonal is evaluated into a +// small temporary buffer which is then accumulated into the result using a +// triangular traversal. +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo> +struct tribb_kernel +{ + typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits; + typedef typename Traits::ResScalar ResScalar; + + enum { + BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret + }; + void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + { + typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; + typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper; + ResMapper res(_res, resStride, resIncr); + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1; + gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2; + + Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert())); + + // let's process the block per panel of actual_mc x BlockSize, + // again, each is split into three parts, etc. + for (Index j=0; j<size; j+=BlockSize) + { + Index actualBlockSize = std::min<Index>(BlockSize,size - j); + const RhsScalar* actual_b = blockB+j*depth; + + if(UpLo==Upper) + gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + + // selfadjoint micro block + { + Index i = j; + buffer.setZero(); + // 1 - apply the kernel on the temporary buffer + gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + + // 2 - triangular accumulation + for(Index j1=0; j1<actualBlockSize; ++j1) + { + typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1); + for(Index i1=UpLo==Lower ? j1 : 0; + UpLo==Lower ? 
i1<actualBlockSize : i1<=j1; ++i1) + r(i1) += buffer(i1,j1); + } + } + + if(UpLo==Lower) + { + Index i = j+actualBlockSize; + gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, + depth, actualBlockSize, alpha, -1, -1, 0, 0); + } + } + } +}; + +} // end namespace internal + +// high level API + +template<typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct> +struct general_product_to_triangular_selector; + + +template<typename MatrixType, typename ProductType, int UpLo> +struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true> +{ + static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta) + { + typedef typename MatrixType::Scalar Scalar; + + typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs; + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs; + typedef typename internal::remove_all<ActualLhs>::type _ActualLhs; + typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs()); + + typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs; + typedef typename internal::remove_all<ActualRhs>::type _ActualRhs; + typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs()); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + + if(!beta) + mat.template triangularView<UpLo>().setZero(); + + enum { + StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor, + UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1, + UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1 + }; + + internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs; + ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(), + (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data())); + if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs; + + internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs; + ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(), + (UseRhsDirectly ? 
const_cast<Scalar*>(actualRhs.data()) : static_rhs.data())); + if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; + + + selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo, + LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex> + ::run(actualLhs.size(), mat.data(), mat.outerStride(), actualLhsPtr, actualRhsPtr, actualAlpha); + } +}; + +template<typename MatrixType, typename ProductType, int UpLo> +struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false> +{ + static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta) + { + typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs; + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs; + typedef typename internal::remove_all<ActualLhs>::type _ActualLhs; + typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs()); + + typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs; + typedef typename internal::remove_all<ActualRhs>::type _ActualRhs; + typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs()); + + typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + + if(!beta) + mat.template triangularView<UpLo>().setZero(); + + enum { + IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0, + LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0, + SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0 + }; + + Index size = mat.cols(); + if(SkipDiag) + size--; + Index depth = actualLhs.cols(); + + typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar, + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + + internal::general_matrix_matrix_triangular_product<Index, + typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)> + ::run(size, depth, + &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(), + &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(), + mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 
mat.innerStride() : mat.outerStride() ) : 0), + mat.innerStride(), mat.outerStride(), actualAlpha, blocking); + } +}; + +template<typename MatrixType, unsigned int UpLo> +template<typename ProductType> +EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +{ + EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); + eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); + + general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); + + return derived(); +} + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h new file mode 100644 index 000000000..9a650ec23 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -0,0 +1,145 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Level 3 BLAS SYRK/HERK implementation. 
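In scalar form, the operation implemented by the built-in kernel above, and by the SYRK/HERK bindings in this file, is roughly the following (a sketch with illustrative names; conjugation is omitted):

    // C is size x size, A is size x depth, B is depth x size; only one triangular half of C is written:
    for (int j = 0; j < size; ++j)
      for (int i = j; i < size; ++i)             // Lower half; for the Upper half run i from 0 to j
        for (int k = 0; k < depth; ++k)
          C(i, j) += alpha * A(i, k) * B(k, j);

SYRK/HERK cover the special case B = A.transpose() / A.adjoint(), which is why the specialization below only short-circuits to the BLAS routine when lhs == rhs.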
+ ******************************************************************************** +*/ + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H + +namespace Eigen { + +namespace internal { + +template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo> +struct general_matrix_matrix_rankupdate : + general_matrix_matrix_triangular_product< + Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {}; + + +// try to go to BLAS specialization +#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \ +template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs, int UpLo> \ +struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \ + Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \ + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \ + { \ + if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \ + general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ + } else { \ + general_matrix_matrix_triangular_product<Index, \ + Scalar, LhsStorageOrder, ConjugateLhs, \ + Scalar, RhsStorageOrder, ConjugateRhs, \ + ColMajor, 1, UpLo, BuiltIn> \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \ + } \ + } \ +}; + +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float) +// TODO handle complex cases +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex) +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex) + +// SYRK for float/double +#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \ +template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ +struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ + enum { \ + IsLower = (UpLo&Lower) == Lower, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \ + }; \ + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ + { \ + /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \ +\ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ + char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \ + EIGTYPE beta(1); \ + BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \ + } \ +}; + +// HERK for complex data +#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \ +template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ +struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ + enum { \ + IsLower = (UpLo&Lower) == Lower, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 
1 : 0 \ + }; \ + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ + { \ + typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \ +\ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ + char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \ + RTYPE alpha_, beta_; \ + const EIGTYPE* a_ptr; \ +\ + alpha_ = alpha.real(); \ + beta_ = 1.0; \ +/* Copy with conjugation in some cases*/ \ + MatrixType a; \ + if (conjA) { \ + Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \ + a = mapA.conjugate(); \ + lda = a.outerStride(); \ + a_ptr = a.data(); \ + } else a_ptr=lhs; \ + BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk) +EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk) +#else +EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_) +EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_) +#endif + +// TODO hanlde complex cases +// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_) +// EIGEN_BLAS_RANKUPDATE_C(scomplex, float, float, cherk_) + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h new file mode 100644 index 000000000..71abf4013 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h @@ -0,0 +1,124 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
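A typical user-level expression that can reach the rank-update specializations above, when EIGEN_USE_BLAS is defined and the scalar type is float or double (a sketch only):

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 20);
    Eigen::MatrixXd C = Eigen::MatrixXd::Zero(100, 100);
    // Only the lower triangle of C is updated with C += 2.0 * A * A.transpose();
    // because lhs == rhs, the call is routed to ?syrk instead of the generic triangular GEMM.
    C.selfadjointView<Eigen::Lower>().rankUpdate(A, 2.0);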
+ + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * General matrix-matrix product functionality based on ?GEMM. + ******************************************************************************** +*/ + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements general matrix-matrix multiplication using BLAS +* gemm function via partial specialization of +* general_matrix_matrix_product::run(..) method for float, double, +* std::complex<float> and std::complex<double> types +**********************************************************************/ + +// gemm specialization + +#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \ +template< \ + typename Index, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \ +{ \ +typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \ +\ +static void run(Index rows, Index cols, Index depth, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ + EIGTYPE alpha, \ + level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \ + GemmParallelInfo<Index>* /*info = 0*/) \ +{ \ + using std::conj; \ +\ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ + char transa, transb; \ + BlasIndex m, n, k, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX a_tmp, b_tmp; \ +\ +/* Set transpose options */ \ + transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ + transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 
'C' : 'T') : 'N'; \ +\ +/* Set m, n, k */ \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ + k = convert_index<BlasIndex>(depth); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ +\ +/* Set a, b, c */ \ + if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \ + a_tmp = lhs.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ + } else a = _lhs; \ +\ + if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \ + b_tmp = rhs.conjugate(); \ + b = b_tmp.data(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ + } else b = _rhs; \ +\ + BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +}}; + +#ifdef EIGEN_USE_MKL +GEMM_SPECIALIZATION(double, d, double, dgemm) +GEMM_SPECIALIZATION(float, f, float, sgemm) +GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm) +GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm) +#else +GEMM_SPECIALIZATION(double, d, double, dgemm_) +GEMM_SPECIALIZATION(float, f, float, sgemm_) +GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_) +GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_) +#endif + +} // end namespase internal + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h new file mode 100644 index 000000000..dfb6aebce --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h @@ -0,0 +1,518 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
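The transpose flags in the ?gemm binding above map Eigen's storage orders onto the column-major BLAS convention; the following summary paraphrases the macro and is not additional upstream code:

    //   LhsStorageOrder==ColMajor, !ConjugateLhs  ->  transa = 'N'
    //   LhsStorageOrder==ColMajor,  ConjugateLhs  ->  transa = 'N', A first conjugated into a temporary
    //   LhsStorageOrder==RowMajor, !ConjugateLhs  ->  transa = 'T'  (row-major A is A^T seen column-major)
    //   LhsStorageOrder==RowMajor,  ConjugateLhs  ->  transa = 'C'
    //   (and symmetrically for transb / the rhs)

At the user level, a plain dense product such as C.noalias() += A * B; for float, double or complex matrices is what lands in this specialization when EIGEN_USE_BLAS is defined.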
+ +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H +#define EIGEN_GENERAL_MATRIX_VECTOR_H + +namespace Eigen { + +namespace internal { + +enum GEMVPacketSizeType { + GEMVPacketFull = 0, + GEMVPacketHalf, + GEMVPacketQuarter +}; + +template <int N, typename T1, typename T2, typename T3> +struct gemv_packet_cond { typedef T3 type; }; + +template <typename T1, typename T2, typename T3> +struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; }; + +template <typename T1, typename T2, typename T3> +struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; }; + +template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull> +class gemv_traits +{ + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename gemv_packet_cond<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + prefix ## name ## Packet + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); +#undef PACKET_DECL_COND_PREFIX + +public: + enum { + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && + unpacket_traits<_RhsPacket>::vectorizable && + int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size), + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1 + }; + + typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; + typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; + typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; +}; + + +/* Optimized col-major matrix * vector product: + * This algorithm processes the matrix per vertical panels, + * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments. + * + * Mixing type logic: C += alpha * A * B + * | A | B |alpha| comments + * |real |cplx |cplx | no vectorization + * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization + * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp + * |cplx |real |real | optimal case, vectorization possible via real-cplx mul + * + * The same reasoning apply for the transposed case. 
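In scalar form the column-major kernel below computes the following (a reference sketch only; the conjugation applied through conj_helper is omitted):

    // res[0..rows) += alpha * A * x, with A column-major of size rows x cols:
    for (Index j = 0; j < cols; ++j)
      for (Index i = 0; i < rows; ++i)
        res[i] += alpha * A(i, j) * x(j);

The vectorized implementation keeps 8 (then 4, 3, 2, 1) packet accumulators per chunk of rows, followed by half-packet, quarter-packet and scalar tails, and walks the columns in blocks of block_cols (all columns when cols < 128, otherwise 16 or 4 depending on the stride) so that the accessed panel of the matrix stays in cache.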
+ */ +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> +{ + typedef gemv_traits<LhsScalar,RhsScalar> Traits; + typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits; + typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits; + + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + RhsScalar alpha); +}; + +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + RhsScalar alpha) +{ + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr==1); + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); + + conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half; + conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter; + + const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; + + const Index n8 = rows-8*ResPacketSize+1; + const Index n4 = rows-4*ResPacketSize+1; + const Index n3 = rows-3*ResPacketSize+1; + const Index n2 = rows-2*ResPacketSize+1; + const Index n1 = rows-1*ResPacketSize+1; + const Index n_half = rows-1*ResPacketSizeHalf+1; + const Index n_quarter = rows-1*ResPacketSizeQuarter+1; + + // TODO: improve the following heuristic: + const Index block_cols = cols<128 ? 
cols : (lhsStride*sizeof(LhsScalar)<32000?16:4); + ResPacket palpha = pset1<ResPacket>(alpha); + ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha); + ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha); + + for(Index j2=0; j2<cols; j2+=block_cols) + { + Index jend = numext::mini(j2+block_cols,cols); + Index i=0; + for(; i<n8; i+=ResPacketSize*8) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)), + c1 = pset1<ResPacket>(ResScalar(0)), + c2 = pset1<ResPacket>(ResScalar(0)), + c3 = pset1<ResPacket>(ResScalar(0)), + c4 = pset1<ResPacket>(ResScalar(0)), + c5 = pset1<ResPacket>(ResScalar(0)), + c6 = pset1<ResPacket>(ResScalar(0)), + c7 = pset1<ResPacket>(ResScalar(0)); + + for(Index j=j2; j<jend; j+=1) + { + RhsPacket b0 = pset1<RhsPacket>(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3); + c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3))); + pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4))); + pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5))); + pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6))); + pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7))); + } + if(i<n4) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)), + c1 = pset1<ResPacket>(ResScalar(0)), + c2 = pset1<ResPacket>(ResScalar(0)), + c3 = pset1<ResPacket>(ResScalar(0)); + + for(Index j=j2; j<jend; j+=1) + { + RhsPacket b0 = pset1<RhsPacket>(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3))); + + i+=ResPacketSize*4; + } + if(i<n3) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)), + c1 = pset1<ResPacket>(ResScalar(0)), + c2 = pset1<ResPacket>(ResScalar(0)); + + for(Index j=j2; j<jend; j+=1) + { + RhsPacket b0 = pset1<RhsPacket>(rhs(j,0)); + c0 = pcj.pmadd(lhs.template 
load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2))); + + i+=ResPacketSize*3; + } + if(i<n2) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)), + c1 = pset1<ResPacket>(ResScalar(0)); + + for(Index j=j2; j<jend; j+=1) + { + RhsPacket b0 = pset1<RhsPacket>(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1))); + i+=ResPacketSize*2; + } + if(i<n1) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)); + for(Index j=j2; j<jend; j+=1) + { + RhsPacket b0 = pset1<RhsPacket>(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0))); + i+=ResPacketSize; + } + if(HasHalf && i<n_half) + { + ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0)); + for(Index j=j2; j<jend; j+=1) + { + RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0)); + c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0))); + i+=ResPacketSizeHalf; + } + if(HasQuarter && i<n_quarter) + { + ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0)); + for(Index j=j2; j<jend; j+=1) + { + RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0)); + c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0))); + i+=ResPacketSizeQuarter; + } + for(;i<rows;++i) + { + ResScalar c0(0); + for(Index j=j2; j<jend; j+=1) + c0 += cj.pmul(lhs(i,j), rhs(j,0)); + res[i] += alpha*c0; + } + } +} + +/* Optimized row-major matrix * vector product: + * This algorithm processes 4 rows at once that allows to both reduce + * the number of load/stores of the result by a factor 4 and to reduce + * the instruction dependency. Moreover, we know that all bands have the + * same alignment pattern. 
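A scalar reference for the row-major kernel below (a sketch only; conjugation omitted): every destination coefficient is a dot product of one matrix row with the input vector, and the vectorized code evaluates 8, 4 or 2 such rows at once, reducing each packet accumulator with predux() before a scalar tail loop finishes the remaining columns.

    for (Index i = 0; i < rows; ++i) {
      ResScalar acc(0);
      for (Index j = 0; j < cols; ++j)
        acc += A(i, j) * x(j);
      res[i * resIncr] += alpha * acc;
    }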
+ * + * Mixing type logic: + * - alpha is always a complex (or converted to a complex) + * - no vectorization + */ +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> +{ + typedef gemv_traits<LhsScalar,RhsScalar> Traits; + typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits; + typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits; + + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha); +}; + +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); + + eigen_internal_assert(rhs.stride()==1); + conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half; + conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter; + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. + const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 
0 : rows-7; + const Index n4 = rows-3; + const Index n2 = rows-1; + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + LhsPacketSizeHalf = HalfTraits::LhsPacketSize, + LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; + + Index i=0; + for(; i<n8; i+=8) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)), + c1 = pset1<ResPacket>(ResScalar(0)), + c2 = pset1<ResPacket>(ResScalar(0)), + c3 = pset1<ResPacket>(ResScalar(0)), + c4 = pset1<ResPacket>(ResScalar(0)), + c5 = pset1<ResPacket>(ResScalar(0)), + c6 = pset1<ResPacket>(ResScalar(0)), + c7 = pset1<ResPacket>(ResScalar(0)); + + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + { + RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0); + + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3); + c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + ResScalar cc4 = predux(c4); + ResScalar cc5 = predux(c5); + ResScalar cc6 = predux(c6); + ResScalar cc7 = predux(c7); + for(; j<cols; ++j) + { + RhsScalar b0 = rhs(j,0); + + cc0 += cj.pmul(lhs(i+0,j), b0); + cc1 += cj.pmul(lhs(i+1,j), b0); + cc2 += cj.pmul(lhs(i+2,j), b0); + cc3 += cj.pmul(lhs(i+3,j), b0); + cc4 += cj.pmul(lhs(i+4,j), b0); + cc5 += cj.pmul(lhs(i+5,j), b0); + cc6 += cj.pmul(lhs(i+6,j), b0); + cc7 += cj.pmul(lhs(i+7,j), b0); + } + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + res[(i+2)*resIncr] += alpha*cc2; + res[(i+3)*resIncr] += alpha*cc3; + res[(i+4)*resIncr] += alpha*cc4; + res[(i+5)*resIncr] += alpha*cc5; + res[(i+6)*resIncr] += alpha*cc6; + res[(i+7)*resIncr] += alpha*cc7; + } + for(; i<n4; i+=4) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)), + c1 = pset1<ResPacket>(ResScalar(0)), + c2 = pset1<ResPacket>(ResScalar(0)), + c3 = pset1<ResPacket>(ResScalar(0)); + + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + { + RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0); + + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + for(; j<cols; ++j) + { + RhsScalar b0 = rhs(j,0); + + cc0 += cj.pmul(lhs(i+0,j), b0); + cc1 += cj.pmul(lhs(i+1,j), b0); + cc2 += cj.pmul(lhs(i+2,j), b0); + cc3 += cj.pmul(lhs(i+3,j), b0); + } + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + 
res[(i+2)*resIncr] += alpha*cc2; + res[(i+3)*resIncr] += alpha*cc3; + } + for(; i<n2; i+=2) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)), + c1 = pset1<ResPacket>(ResScalar(0)); + + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + { + RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0); + + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + for(; j<cols; ++j) + { + RhsScalar b0 = rhs(j,0); + + cc0 += cj.pmul(lhs(i+0,j), b0); + cc1 += cj.pmul(lhs(i+1,j), b0); + } + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + } + for(; i<rows; ++i) + { + ResPacket c0 = pset1<ResPacket>(ResScalar(0)); + ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0)); + ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0)); + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + { + RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0); + c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0); + } + ResScalar cc0 = predux(c0); + if (HasHalf) { + for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + { + RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0); + c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h); + } + cc0 += predux(c0_h); + } + if (HasQuarter) { + for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter) + { + RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0); + c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q); + } + cc0 += predux(c0_q); + } + for(; j<cols; ++j) + { + cc0 += cj.pmul(lhs(i,j), rhs(j,0)); + } + res[i*resIncr] += alpha*cc0; + } +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_VECTOR_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h new file mode 100644 index 000000000..6e36c2b3c --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -0,0 +1,136 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * General matrix-vector product functionality based on ?GEMV. + ******************************************************************************** +*/ + +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H +#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements general matrix-vector multiplication using BLAS +* gemv function via partial specialization of +* general_matrix_vector_product::run(..) method for float, double, +* std::complex<float> and std::complex<double> types +**********************************************************************/ + +// gemv specialization + +template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct general_matrix_vector_product_gemv; + +#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \ +template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \ +struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \ +static void run( \ + Index rows, Index cols, \ + const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \ + const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \ + Scalar* res, Index resIncr, Scalar alpha) \ +{ \ + if (ConjugateLhs) { \ + general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \ + rows, cols, lhs, rhs, res, resIncr, alpha); \ + } else { \ + general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ + } \ +} \ +}; \ +template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \ +struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \ +static void run( \ + Index rows, Index cols, \ + const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \ + const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \ + Scalar* res, Index resIncr, Scalar alpha) \ +{ \ + general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ +} \ +}; \ + +EIGEN_BLAS_GEMV_SPECIALIZE(double) +EIGEN_BLAS_GEMV_SPECIALIZE(float) +EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) + +#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ +template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \ 
+struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \ +{ \ +typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\ +\ +static void run( \ + Index rows, Index cols, \ + const EIGTYPE* lhs, Index lhsStride, \ + const EIGTYPE* rhs, Index rhsIncr, \ + EIGTYPE* res, Index resIncr, EIGTYPE alpha) \ +{ \ + BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \ + lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \ + const EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ + char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \ + if (LhsStorageOrder==RowMajor) { \ + m = convert_index<BlasIndex>(cols); \ + n = convert_index<BlasIndex>(rows); \ + }\ + GEMVVector x_tmp; \ + if (ConjugateRhs) { \ + Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \ + x_tmp=map_x.conjugate(); \ + x_ptr=x_tmp.data(); \ + incx=1; \ + } else x_ptr=rhs; \ + BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ +}\ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv) +#else +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_) +#endif + +} // end namespase internal + +} // end namespace Eigen + +#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h b/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h new file mode 100644 index 000000000..8f91879e4 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h @@ -0,0 +1,180 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
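In the ?gemv binding above, a column-major lhs maps directly to trans='N'; a row-major lhs is handed to BLAS as its transpose (trans='T', or 'C' when conjugated) with m and n swapped, and a conjugated rhs is first copied into a contiguous temporary. A user-level expression that can reach this path when EIGEN_USE_BLAS is defined (a sketch):

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(1000, 200);
    Eigen::VectorXd x = Eigen::VectorXd::Random(200);
    Eigen::VectorXd y = Eigen::VectorXd::Zero(1000);
    y.noalias() += 2.0 * A * x;   // dense matrix * vector, routed to dgemv for double scalars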
+ +#ifndef EIGEN_PARALLELIZER_H +#define EIGEN_PARALLELIZER_H + +#if EIGEN_HAS_CXX11_ATOMIC +#include <atomic> +#endif + +namespace Eigen { + +namespace internal { + +/** \internal */ +inline void manage_multi_threading(Action action, int* v) +{ + static int m_maxThreads = -1; + EIGEN_UNUSED_VARIABLE(m_maxThreads) + + if(action==SetAction) + { + eigen_internal_assert(v!=0); + m_maxThreads = *v; + } + else if(action==GetAction) + { + eigen_internal_assert(v!=0); + #ifdef EIGEN_HAS_OPENMP + if(m_maxThreads>0) + *v = m_maxThreads; + else + *v = omp_get_max_threads(); + #else + *v = 1; + #endif + } + else + { + eigen_internal_assert(false); + } +} + +} + +/** Must be call first when calling Eigen from multiple threads */ +inline void initParallel() +{ + int nbt; + internal::manage_multi_threading(GetAction, &nbt); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); +} + +/** \returns the max number of threads reserved for Eigen + * \sa setNbThreads */ +inline int nbThreads() +{ + int ret; + internal::manage_multi_threading(GetAction, &ret); + return ret; +} + +/** Sets the max number of threads reserved for Eigen + * \sa nbThreads */ +inline void setNbThreads(int v) +{ + internal::manage_multi_threading(SetAction, &v); +} + +namespace internal { + +template<typename Index> struct GemmParallelInfo +{ + GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} + + // volatile is not enough on all architectures (see bug 1572) + // to guarantee that when thread A says to thread B that it is + // done with packing a block, then all writes have been really + // carried out... C++11 memory model+atomic guarantees this. +#if EIGEN_HAS_CXX11_ATOMIC + std::atomic<Index> sync; + std::atomic<int> users; +#else + Index volatile sync; + int volatile users; +#endif + + Index lhs_start; + Index lhs_length; +}; + +template<bool Condition, typename Functor, typename Index> +void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose) +{ + // TODO when EIGEN_USE_BLAS is defined, + // we should still enable OMP for other scalar types + // Without C++11, we have to disable GEMM's parallelization on + // non x86 architectures because there volatile is not enough for our purpose. + // See bug 1572. +#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64)) + // FIXME the transpose variable is only needed to properly split + // the matrix product when multithreading is enabled. This is a temporary + // fix to support row-major destination matrices. This whole + // parallelizer mechanism has to be redesigned anyway. + EIGEN_UNUSED_VARIABLE(depth); + EIGEN_UNUSED_VARIABLE(transpose); + func(0,rows, 0,cols); +#else + + // Dynamically check whether we should enable or disable OpenMP. + // The conditions are: + // - the max number of threads we can create is greater than 1 + // - we are not already in a parallel code + // - the sizes are large enough + + // compute the maximal number of threads from the size of the product: + // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once. + Index size = transpose ? 
rows : cols; + Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr); + + // compute the maximal number of threads from the total amount of work: + double work = static_cast<double>(rows) * static_cast<double>(cols) * + static_cast<double>(depth); + double kMinTaskSize = 50000; // FIXME improve this heuristic. + pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) )); + + // compute the number of threads we are going to use + Index threads = std::min<Index>(nbThreads(), pb_max_threads); + + // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, + // then abort multi-threading + // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? + if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) + return func(0,rows, 0,cols); + + Eigen::initParallel(); + func.initParallelSession(threads); + + if(transpose) + std::swap(rows,cols); + + ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0); + + #pragma omp parallel num_threads(threads) + { + Index i = omp_get_thread_num(); + // Note that the actual number of threads might be lower than the number of request ones. + Index actual_threads = omp_get_num_threads(); + + Index blockCols = (cols / actual_threads) & ~Index(0x3); + Index blockRows = (rows / actual_threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; + + Index r0 = i*blockRows; + Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows; + + Index c0 = i*blockCols; + Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols; + + info[i].lhs_start = r0; + info[i].lhs_length = actualBlockRows; + + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); + } +#endif +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PARALLELIZER_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h new file mode 100644 index 000000000..33ecf10f6 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -0,0 +1,544 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
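The Parallelizer.h hunk above also defines the public entry points used around parallelize_gemm: initParallel(), setNbThreads() and nbThreads(). A minimal usage sketch (annotation, not part of the patch), assuming the program is built with OpenMP support (e.g. -fopenmp); the matrix size and thread cap are illustrative:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::initParallel();   // recommended before using Eigen from several threads
      Eigen::setNbThreads(4);  // cap the number of threads GEMM may use
      std::cout << "Eigen may use up to " << Eigen::nbThreads() << " threads\n";

      Eigen::MatrixXd A = Eigen::MatrixXd::Random(1024, 1024);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(1024, 1024);
      // Large enough that the work / kMinTaskSize heuristic above keeps OpenMP enabled.
      Eigen::MatrixXd C = A * B;
      return C.rows() == 1024 ? 0 : 1;
    }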
+ +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H + +namespace Eigen { + +namespace internal { + +// pack a selfadjoint block diagonal for use with the gebp_kernel +template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder> +struct symm_pack_lhs +{ + template<int BlockRows> inline + void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count) + { + // normal copy + for(Index k=0; k<i; k++) + for(Index w=0; w<BlockRows; w++) + blockA[count++] = lhs(i+w,k); // normal + // symmetric copy + Index h = 0; + for(Index k=i; k<i+BlockRows; k++) + { + for(Index w=0; w<h; w++) + blockA[count++] = numext::conj(lhs(k, i+w)); // transposed + + blockA[count++] = numext::real(lhs(k,k)); // real (diagonal) + + for(Index w=h+1; w<BlockRows; w++) + blockA[count++] = lhs(i+w, k); // normal + ++h; + } + // transposed copy + for(Index k=i+BlockRows; k<cols; k++) + for(Index w=0; w<BlockRows; w++) + blockA[count++] = numext::conj(lhs(k, i+w)); // transposed + } + void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) + { + typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits<Scalar>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + + const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); + Index count = 0; + //Index peeled_mc3 = (rows/Pack1)*Pack1; + + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? 
peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; + + if(Pack1>=3*PacketSize) + for(Index i=0; i<peeled_mc3; i+=3*PacketSize) + pack<3*PacketSize>(blockA, lhs, cols, i, count); + + if(Pack1>=2*PacketSize) + for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize) + pack<2*PacketSize>(blockA, lhs, cols, i, count); + + if(Pack1>=1*PacketSize) + for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize) + pack<1*PacketSize>(blockA, lhs, cols, i, count); + + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize) + pack<HalfPacketSize>(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize) + pack<QuarterPacketSize>(blockA, lhs, cols, i, count); + + // do the same with mr==1 + for(Index i=peeled_mc_quarter; i<rows; i++) + { + for(Index k=0; k<i; k++) + blockA[count++] = lhs(i, k); // normal + + blockA[count++] = numext::real(lhs(i, i)); // real (diagonal) + + for(Index k=i+1; k<cols; k++) + blockA[count++] = numext::conj(lhs(k, i)); // transposed + } + } +}; + +template<typename Scalar, typename Index, int nr, int StorageOrder> +struct symm_pack_rhs +{ + enum { PacketSize = packet_traits<Scalar>::size }; + void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + Index end_k = k2 + rows; + Index count = 0; + const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride); + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + + // first part: normal case + for(Index j2=0; j2<k2; j2+=nr) + { + for(Index k=k2; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); + if (nr>=4) + { + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); + } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } + count += nr; + } + } + + // second part: diagonal block + Index end8 = nr>=8 ? 
(std::min)(k2+rows,packet_cols8) : k2; + if(nr>=8) + { + for(Index j2=k2; j2<end8; j2+=8) + { + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k<j2; k++) + { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; + } + // symmetric + Index h = 0; + for(Index k=j2; k<j2+8; k++) + { + // normal + for (Index w=0 ; w<h; ++w) + blockB[count+w] = rhs(k,j2+w); + + blockB[count+h] = numext::real(rhs(k,k)); + + // transpose + for (Index w=h+1 ; w<8; ++w) + blockB[count+w] = numext::conj(rhs(j2+w,k)); + count += 8; + ++h; + } + // normal + for(Index k=j2+8; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) + { + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k<j2; k++) + { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + count += 4; + } + // symmetric + Index h = 0; + for(Index k=j2; k<j2+4; k++) + { + // normal + for (Index w=0 ; w<h; ++w) + blockB[count+w] = rhs(k,j2+w); + + blockB[count+h] = numext::real(rhs(k,k)); + + // transpose + for (Index w=h+1 ; w<4; ++w) + blockB[count+w] = numext::conj(rhs(j2+w,k)); + count += 4; + ++h; + } + // normal + for(Index k=j2+4; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); + count += 4; + } + } + } + + // third part: transposed + if(nr>=8) + { + for(Index j2=k2+rows; j2<packet_cols8; j2+=8) + { + for(Index k=k2; k<end_k; k++) + { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4) + { + for(Index k=k2; k<end_k; k++) + { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + count += 4; + } + } + } + + // copy the remaining columns one at a time (=> the same with nr==1) + for(Index j2=packet_cols4; j2<cols; ++j2) + { + // transpose + Index half = (std::min)(end_k,j2); + for(Index k=k2; k<half; k++) + { + blockB[count] = numext::conj(rhs(j2,k)); + count += 1; + } + + if(half==j2 && half<k2+rows) + { + blockB[count] = numext::real(rhs(j2,j2)); + count += 1; + } + else + half--; + + // normal + for(Index k=half+1; k<k2+rows; k++) + { + blockB[count] = 
rhs(k,j2); + count += 1; + } + } + } +}; + +/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of + * the general matrix matrix product. + */ +template <typename Scalar, typename Index, + int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs, + int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, + int ResStorageOrder, int ResInnerStride> +struct product_selfadjoint_matrix; + +template <typename Scalar, typename Index, + int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs, + int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride> +{ + + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, Index rhsStride, + Scalar* res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) + { + product_selfadjoint_matrix<Scalar, Index, + EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor, + RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), + EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, + LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), + ColMajor,ResInnerStride> + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); + } +}; + +template <typename Scalar, typename Index, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride> +{ + + static EIGEN_DONT_INLINE void run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); +}; + +template <typename Scalar, typename Index, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* _res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) + { + Index size = rows; + + typedef gebp_traits<Scalar,Scalar> Traits; + + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? 
ColMajor : RowMajor> LhsTransposeMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride, resIncr); + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // kc must be smaller than mc + kc = (std::min)(kc,mc); + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; + + for(Index k2=0; k2<size; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,size)-k2; + + // we have selected one row panel of rhs and one column panel of lhs + // pack rhs's panel into a sequential chunk of memory + // and expand each coeff to a constant packet for further reuse + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols); + + // the select lhs's panel has to be split in three different parts: + // 1 - the transposed panel above the diagonal block => transposed packed copy + // 2 - the diagonal block => special packed copy + // 3 - the panel below the diagonal block => generic packed copy + for(Index i2=0; i2<k2; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,k2)-i2; + // transposed packed copy + pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc); + + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + // the block diagonal + { + const Index actual_mc = (std::min)(k2+kc,size)-k2; + // symmetric packed copy + pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + + for(Index i2=k2+kc; i2<size; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,size)-i2; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); + + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + } + } + +// matrix * selfadjoint product +template <typename Scalar, typename Index, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride> +{ + + static EIGEN_DONT_INLINE void run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); +}; + +template <typename Scalar, typename Index, 
+ int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* _res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) + { + Index size = cols; + + typedef gebp_traits<Scalar,Scalar> Traits; + + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride, resIncr); + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; + symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + + for(Index k2=0; k2<size; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,size)-k2; + + pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2); + + // => GEPP + for(Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,rows)-i2; + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); + + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + } + } + +} // end namespace internal + +/*************************************************************************** +* Wrapper to product_selfadjoint_matrix +***************************************************************************/ + +namespace internal { + +template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> +struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false> +{ + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + + enum { + LhsIsUpper = (LhsMode&(Upper|Lower))==Upper, + LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint, + RhsIsUpper = (RhsMode&(Upper|Lower))==Upper, + RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint + }; + + template<typename Dest> + static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + { + eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); + + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType; + + BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); + + internal::product_selfadjoint_matrix<Scalar, Index, + EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, + NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), + EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, + NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), + internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime> + ::run( + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info + actualAlpha, blocking // alpha + ); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h new file mode 100644 index 000000000..61396dbdf --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -0,0 +1,295 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM. 
+ ******************************************************************************** +*/ + +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + + +/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ + +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ +template <typename Index, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \ +{\ +\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ + { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ + char side='L', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ +\ +/* Set transpose options */ \ +/* Set m, n, k */ \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ +\ +/* Set a, b, c */ \ + if (LhsStorageOrder==RowMajor) uplo='U'; \ + a = _lhs; \ +\ + if (RhsStorageOrder==RowMajor) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = rhs.adjoint(); \ + b = b_tmp.data(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ + } else b = _rhs; \ +\ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +\ + } \ +}; + + +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ +template <typename Index, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \ +{\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ + { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ + char side='L', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ + Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \ +\ +/* Set transpose options */ \ +/* Set m, n, k */ \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ +\ +/* Set a, b, c */ \ + if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \ + Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \ + a_tmp = lhs.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ + } else a = _lhs; \ + if (LhsStorageOrder==RowMajor) uplo='U'; \ 
+\ + if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \ + b = _rhs; } \ + else { \ + if (RhsStorageOrder==ColMajor && ConjugateRhs) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \ + b_tmp = rhs.conjugate(); \ + } else \ + if (ConjugateRhs) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = rhs.adjoint(); \ + } else { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = rhs.transpose(); \ + } \ + b = b_tmp.data(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ + } \ +\ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +\ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif + +/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ + +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ +template <typename Index, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \ +{\ +\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ + { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ + char side='R', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ +\ +/* Set m, n, k */ \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ +\ +/* Set a, b, c */ \ + if (RhsStorageOrder==RowMajor) uplo='U'; \ + a = _rhs; \ +\ + if (LhsStorageOrder==RowMajor) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \ + b_tmp = lhs.adjoint(); \ + b = b_tmp.data(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ + } else b = _lhs; \ +\ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ +\ + } \ +}; + + +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ +template <typename Index, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \ +{\ + static void run( \ + Index rows, Index cols, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ + { \ 
+ EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ + char side='R', uplo='L'; \ + BlasIndex m, n, lda, ldb, ldc; \ + const EIGTYPE *a, *b; \ + EIGTYPE beta(1); \ + MatrixX##EIGPREFIX b_tmp; \ + Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \ +\ +/* Set m, n, k */ \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ +\ +/* Set lda, ldb, ldc */ \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ +\ +/* Set a, b, c */ \ + if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \ + Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \ + a_tmp = rhs.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ + } else a = _rhs; \ + if (RhsStorageOrder==RowMajor) uplo='U'; \ +\ + if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \ + b = _lhs; } \ + else { \ + if (LhsStorageOrder==ColMajor && ConjugateLhs) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \ + b_tmp = lhs.conjugate(); \ + } else \ + if (ConjugateLhs) { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \ + b_tmp = lhs.adjoint(); \ + } else { \ + Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \ + b_tmp = lhs.transpose(); \ + } \ + b = b_tmp.data(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ + } \ +\ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h new file mode 100644 index 000000000..d38fd72b2 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -0,0 +1,262 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H + +namespace Eigen { + +namespace internal { + +/* Optimized selfadjoint matrix * vector product: + * This algorithm processes 2 columns at once that allows to both reduce + * the number of load/stores of the result by a factor 2 and to reduce + * the instruction dependency. 
+ */ + +template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized> +struct selfadjoint_matrix_vector_product; + +template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version> +struct selfadjoint_matrix_vector_product + +{ +static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void run( + Index size, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, + Scalar* res, + Scalar alpha); +}; + +template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run( + Index size, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, + Scalar* res, + Scalar alpha) +{ + typedef typename packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + + enum { + IsRowMajor = StorageOrder==RowMajor ? 1 : 0, + IsLower = UpLo == Lower ? 1 : 0, + FirstTriangular = IsRowMajor == IsLower + }; + + conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; + conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd; + + conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; + conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; + + Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha; + + Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe; + if (FirstTriangular) + bound = size - bound; + + for (Index j=FirstTriangular ? bound : 0; + j<(FirstTriangular ? size : bound);j+=2) + { + const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; + const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; + + Scalar t0 = cjAlpha * rhs[j]; + Packet ptmp0 = pset1<Packet>(t0); + Scalar t1 = cjAlpha * rhs[j+1]; + Packet ptmp1 = pset1<Packet>(t1); + + Scalar t2(0); + Packet ptmp2 = pset1<Packet>(t2); + Scalar t3(0); + Packet ptmp3 = pset1<Packet>(t3); + + Index starti = FirstTriangular ? 0 : j+2; + Index endi = FirstTriangular ? j : size; + Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti); + Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); + + res[j] += cjd.pmul(numext::real(A0[j]), t0); + res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1); + if(FirstTriangular) + { + res[j] += cj0.pmul(A1[j], t1); + t3 += cj1.pmul(A1[j], rhs[j]); + } + else + { + res[j+1] += cj0.pmul(A0[j+1],t0); + t2 += cj1.pmul(A0[j+1], rhs[j+1]); + } + + for (Index i=starti; i<alignedStart; ++i) + { + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1); + t2 += cj1.pmul(A0[i], rhs[i]); + t3 += cj1.pmul(A1[i], rhs[i]); + } + // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up) + // gcc 4.2 does this optimization automatically. 
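  // Annotation (not in the original patch): the packet loop below walks the aligned
  // middle range [alignedStart, alignedEnd) PacketSize entries at a time. Each
  // iteration updates res[i..] += A0[i..]*t0 + A1[i..]*t1 via pmadd and, in the same
  // pass, accumulates the dot products A0.rhs and A1.rhs into ptmp2/ptmp3, which are
  // reduced with predux and added to res[j] and res[j+1] after the loop.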
+ const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; + const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; + const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; + Scalar* EIGEN_RESTRICT resIt = res + alignedStart; + for (Index i=alignedStart; i<alignedEnd; i+=PacketSize) + { + Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize; + Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize; + Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases + Packet Xi = pload <Packet>(resIt); + + Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); + ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); + ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); + pstore(resIt,Xi); resIt += PacketSize; + } + for (Index i=alignedEnd; i<endi; i++) + { + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1); + t2 += cj1.pmul(A0[i], rhs[i]); + t3 += cj1.pmul(A1[i], rhs[i]); + } + + res[j] += alpha * (t2 + predux(ptmp2)); + res[j+1] += alpha * (t3 + predux(ptmp3)); + } + for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++) + { + const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; + + Scalar t1 = cjAlpha * rhs[j]; + Scalar t2(0); + res[j] += cjd.pmul(numext::real(A0[j]), t1); + for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++) + { + res[i] += cj0.pmul(A0[i], t1); + t2 += cj1.pmul(A0[i], rhs[i]); + } + res[j] += alpha * t2; + } +} + +} // end namespace internal + +/*************************************************************************** +* Wrapper to product_selfadjoint_vector +***************************************************************************/ + +namespace internal { + +template<typename Lhs, int LhsMode, typename Rhs> +struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true> +{ + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned; + + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; + + enum { LhsUpLo = LhsMode&(Upper|Lower) }; + + template<typename Dest> + static EIGEN_DEVICE_FUNC + void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + { + typedef typename Dest::Scalar ResScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest; + + eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols()); + + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); + + enum { + EvalToDest = (Dest::InnerStrideAtCompileTime==1), + UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1) + }; + + internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest; + internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs; + + ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), + EvalToDest ? 
dest.data() : static_dest.data()); + + ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(), + UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data()); + + if(!EvalToDest) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + Index size = dest.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + MappedDest(actualDestPtr, dest.size()) = dest; + } + + if(!UseRhs) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + Index size = rhs.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs; + } + + + internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, + int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run + ( + lhs.rows(), // size + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + actualRhsPtr, // rhs info + actualDestPtr, // result info + actualAlpha // scale factor + ); + + if(!EvalToDest) + dest = MappedDest(actualDestPtr, dest.size()); + } +}; + +template<typename Lhs, typename Rhs, int RhsMode> +struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false> +{ + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + enum { RhsUpLo = RhsMode&(Upper|Lower) }; + + template<typename Dest> + static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + { + // let's simply transpose the product + Transpose<Dest> destT(dest); + selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false, + Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h new file mode 100644 index 000000000..1238345e3 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -0,0 +1,118 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV. + ******************************************************************************** +*/ + +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements selfadjoint matrix-vector multiplication using BLAS +**********************************************************************/ + +// symv/hemv specialization + +template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> +struct selfadjoint_matrix_vector_product_symv : + selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {}; + +#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \ +template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ +struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \ +static void run( \ + Index size, const Scalar* lhs, Index lhsStride, \ + const Scalar* _rhs, Scalar* res, Scalar alpha) { \ + enum {\ + IsColMajor = StorageOrder==ColMajor \ + }; \ + if (IsColMajor == ConjugateLhs) {\ + selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \ + size, lhs, lhsStride, _rhs, res, alpha); \ + } else {\ + selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \ + size, lhs, lhsStride, _rhs, res, alpha); \ + }\ + } \ +}; \ + +EIGEN_BLAS_SYMV_SPECIALIZE(double) +EIGEN_BLAS_SYMV_SPECIALIZE(float) +EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(scomplex) + +#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ +template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ +struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \ +{ \ +typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\ +\ +static void run( \ +Index size, const EIGTYPE* lhs, Index lhsStride, \ +const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ +{ \ + enum {\ + IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ + IsLower = UpLo == Lower ? 1 : 0 \ + }; \ + BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \ + EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ + char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 
'L' : 'U'); \ + SYMVVector x_tmp; \ + if (ConjugateRhs) { \ + Map<const SYMVVector, 0 > map_x(_rhs,size,1); \ + x_tmp=map_x.conjugate(); \ + x_ptr=x_tmp.data(); \ + } else x_ptr=_rhs; \ + BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ +}\ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +#else +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h new file mode 100644 index 000000000..a21be8050 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h @@ -0,0 +1,133 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SELFADJOINT_PRODUCT_H +#define EIGEN_SELFADJOINT_PRODUCT_H + +/********************************************************************** +* This file implements a self adjoint product: C += A A^T updating only +* half of the selfadjoint matrix C. +* It corresponds to the level 3 SYRK and level 2 SYR Blas routines. +**********************************************************************/ + +namespace Eigen { + + +template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs> +struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs> +{ + static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) + { + internal::conj_if<ConjRhs> cj; + typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap; + typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjLhsType; + for (Index i=0; i<size; ++i) + { + Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1))) + += (alpha * cj(vecY[i])) * ConjLhsType(OtherMap(vecX+(UpLo==Lower ? i : 0),UpLo==Lower ? 
size-i : (i+1))); + } + } +}; + +template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs> +struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs> +{ + static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) + { + selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vecY,vecX,alpha); + } +}; + +template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime> +struct selfadjoint_product_selector; + +template<typename MatrixType, typename OtherType, int UpLo> +struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true> +{ + static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha) + { + typedef typename MatrixType::Scalar Scalar; + typedef internal::blas_traits<OtherType> OtherBlasTraits; + typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; + typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType; + typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived()); + + Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); + + enum { + StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor, + UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1 + }; + internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other; + + ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(), + (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data())); + + if(!UseOtherDirectly) + Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther; + + selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo, + OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex> + ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualOtherPtr, actualAlpha); + } +}; + +template<typename MatrixType, typename OtherType, int UpLo> +struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false> +{ + static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha) + { + typedef typename MatrixType::Scalar Scalar; + typedef internal::blas_traits<OtherType> OtherBlasTraits; + typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; + typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType; + typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived()); + + Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); + + enum { + IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0, + OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualOther.cols(); + + typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar, + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + + + internal::general_matrix_matrix_triangular_product<Index, + Scalar, OtherIsRowMajor ? 
RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo> + ::run(size, depth, + actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(), + mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking); + } +}; + +// high level API + +template<typename MatrixType, unsigned int UpLo> +template<typename DerivedU> +EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> +::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha) +{ + selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha); + + return *this; +} + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINT_PRODUCT_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h new file mode 100644 index 000000000..f752a0bf0 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -0,0 +1,94 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SELFADJOINTRANK2UPTADE_H +#define EIGEN_SELFADJOINTRANK2UPTADE_H + +namespace Eigen { + +namespace internal { + +/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu' + * It corresponds to the Level2 syr2 BLAS routine + */ + +template<typename Scalar, typename Index, typename UType, typename VType, int UpLo> +struct selfadjoint_rank2_update_selector; + +template<typename Scalar, typename Index, typename UType, typename VType> +struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower> +{ + static EIGEN_DEVICE_FUNC + void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + { + const Index size = u.size(); + for (Index i=0; i<size; ++i) + { + Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) += + (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size-i) + + (alpha * numext::conj(v.coeff(i))) * u.tail(size-i); + } + } +}; + +template<typename Scalar, typename Index, typename UType, typename VType> +struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper> +{ + static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + { + const Index size = u.size(); + for (Index i=0; i<size; ++i) + Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) += + (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i+1) + + (alpha * numext::conj(v.coeff(i))) * u.head(i+1); + } +}; + +template<bool Cond, typename T> struct conj_expr_if + : conditional<!Cond, const T&, + CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {}; + +} // end namespace internal + +template<typename MatrixType, unsigned int UpLo> +template<typename DerivedU, typename DerivedV> +EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> +::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha) +{ + typedef 
internal::blas_traits<DerivedU> UBlasTraits; + typedef typename UBlasTraits::DirectLinearAccessType ActualUType; + typedef typename internal::remove_all<ActualUType>::type _ActualUType; + typename internal::add_const_on_value_type<ActualUType>::type actualU = UBlasTraits::extract(u.derived()); + + typedef internal::blas_traits<DerivedV> VBlasTraits; + typedef typename VBlasTraits::DirectLinearAccessType ActualVType; + typedef typename internal::remove_all<ActualVType>::type _ActualVType; + typename internal::add_const_on_value_type<ActualVType>::type actualV = VBlasTraits::extract(v.derived()); + + // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and + // vice versa, and take the complex conjugate of all coefficients and vector entries. + + enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 }; + Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) + * numext::conj(VBlasTraits::extractScalarFactor(v.derived())); + if (IsRowMajor) + actualAlpha = numext::conj(actualAlpha); + + typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), _ActualUType>::type>::type UType; + typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), _ActualVType>::type>::type VType; + internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, + (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)> + ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); + + return *this; +} + +} // end namespace Eigen + +#endif // EIGEN_SELFADJOINTRANK2UPTADE_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h new file mode 100644 index 000000000..f0c60507a --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -0,0 +1,472 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H + +namespace Eigen { + +namespace internal { + +// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode> +// struct gemm_pack_lhs_triangular +// { +// Matrix<Scalar,mr,mr, +// void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows) +// { +// conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; +// const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); +// int count = 0; +// const int peeled_mc = (rows/mr)*mr; +// for(int i=0; i<peeled_mc; i+=mr) +// { +// for(int k=0; k<depth; k++) +// for(int w=0; w<mr; w++) +// blockA[count++] = cj(lhs(i+w, k)); +// } +// for(int i=peeled_mc; i<rows; i++) +// { +// for(int k=0; k<depth; k++) +// blockA[count++] = cj(lhs(i, k)); +// } +// } +// }; + +/* Optimized triangular matrix * matrix (_TRMM++) product built on top of + * the general matrix matrix product. 
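From the user's side, the product described above is reached through the TriangularView operator*; a minimal sketch using the public Eigen API (editorial illustration, not part of the diffed file) of a call that ends up in product_triangular_matrix_matrix is:

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      MatrixXd A = MatrixXd::Random(64, 64);   // only the lower part is referenced
      MatrixXd B = MatrixXd::Random(64, 32);
      MatrixXd C = MatrixXd::Zero(64, 32);
      // C += alpha * op(triangular) * op(general); for dynamically sized operands
      // this is dispatched through triangular_product_impl to the blocked kernel
      // declared below.
      C.noalias() += 2.0 * (A.triangularView<Lower>() * B);
      return 0;
    }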
+ */ +template <typename Scalar, typename Index, + int Mode, bool LhsIsTriangular, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResStorageOrder, int ResInnerStride, + int Version = Specialized> +struct product_triangular_matrix_matrix; + +template <typename Scalar, typename Index, + int Mode, bool LhsIsTriangular, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> +struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version> +{ + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, Index depth, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, Index rhsStride, + Scalar* res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) + { + product_triangular_matrix_matrix<Scalar, Index, + (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper), + (!LhsIsTriangular), + RhsStorageOrder==RowMajor ? ColMajor : RowMajor, + ConjugateRhs, + LhsStorageOrder==RowMajor ? ColMajor : RowMajor, + ConjugateLhs, + ColMajor, ResInnerStride> + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); + } +}; + +// implements col-major += alpha * op(triangular) * op(general) +template <typename Scalar, typename Index, int Mode, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> +struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> +{ + + typedef gebp_traits<Scalar,Scalar> Traits; + enum { + SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower, + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 + }; + + static EIGEN_DONT_INLINE void run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); +}; + +template <typename Scalar, typename Index, int Mode, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> +EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* _res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) + { + // strip zeros + Index diagSize = (std::min)(_rows,_depth); + Index rows = IsLower ? _rows : diagSize; + Index depth = IsLower ? 
diagSize : _depth; + Index cols = _cols; + + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride, resIncr); + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // The small panel size must not be larger than blocking size. + // Usually this should never be the case because SmallPanelWidth^2 is very small + // compared to L2 cache size, but let's be safe: + Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc)); + + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + + // To work around an "error: member reference base type 'Matrix<...> + // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is + // not a structure or union" compilation error in nvcc (tested V8.0.61), + // create a dummy internal::constructor_without_unaligned_array_assert + // object to pass to the Matrix constructor. + internal::constructor_without_unaligned_array_assert a; + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a); + triangularBuffer.setZero(); + if((Mode&ZeroDiag)==ZeroDiag) + triangularBuffer.diagonal().setZero(); + else + triangularBuffer.diagonal().setOnes(); + + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + + for(Index k2=IsLower ? depth : 0; + IsLower ? k2>0 : k2<depth; + IsLower ? k2-=kc : k2+=kc) + { + Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc); + Index actual_k2 = IsLower ? k2-actual_kc : k2; + + // align blocks with the end of the triangular part for trapezoidal lhs + if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows)) + { + actual_kc = rows-k2; + k2 = k2+actual_kc-kc; + } + + pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols); + + // the selected lhs's panel has to be split in three different parts: + // 1 - the part which is zero => skip it + // 2 - the diagonal block => special kernel + // 3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP + + // the block diagonal, if any: + if(IsLower || actual_k2<rows) + { + // for each small vertical panels of lhs + for (Index k1=0; k1<actual_kc; k1+=panelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth); + Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; + Index startBlock = actual_k2+k1; + Index blockBOffset = k1; + + // => GEBP with the micro triangular block + // The trick is to pack this micro block while filling the opposite triangular part with zeros. + // To this end we do an extra triangular copy to a small temporary buffer + for (Index k=0;k<actualPanelWidth;++k) + { + if (SetDiag) + triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k); + for (Index i=IsLower ? k+1 : 0; IsLower ? 
i<actualPanelWidth : i<k; ++i) + triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k); + } + pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth); + + gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB, + actualPanelWidth, actualPanelWidth, cols, alpha, + actualPanelWidth, actual_kc, 0, blockBOffset); + + // GEBP with remaining micro panel + if (lengthTarget>0) + { + Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2; + + pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); + + gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB, + lengthTarget, actualPanelWidth, cols, alpha, + actualPanelWidth, actual_kc, 0, blockBOffset); + } + } + } + // the part below (lower case) or above (upper case) the diagonal => GEPP + { + Index start = IsLower ? k2 : 0; + Index end = IsLower ? rows : (std::min)(actual_k2,rows); + for(Index i2=start; i2<end; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,end)-i2; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); + + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); + } + } + } + } + +// implements col-major += alpha * op(general) * op(triangular) +template <typename Scalar, typename Index, int Mode, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> +struct product_triangular_matrix_matrix<Scalar,Index,Mode,false, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> +{ + typedef gebp_traits<Scalar,Scalar> Traits; + enum { + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower, + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 + }; + + static EIGEN_DONT_INLINE void run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); +}; + +template <typename Scalar, typename Index, int Mode, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> +EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* _res, Index resIncr, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) + { + const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar); + // strip zeros + Index diagSize = (std::min)(_cols,_depth); + Index rows = _rows; + Index depth = IsLower ? _depth : diagSize; + Index cols = IsLower ? 
diagSize : _cols; + + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride, resIncr); + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar); + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + + internal::constructor_without_unaligned_array_assert a; + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a); + triangularBuffer.setZero(); + if((Mode&ZeroDiag)==ZeroDiag) + triangularBuffer.diagonal().setZero(); + else + triangularBuffer.diagonal().setOnes(); + + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + + for(Index k2=IsLower ? 0 : depth; + IsLower ? k2<depth : k2>0; + IsLower ? k2+=kc : k2-=kc) + { + Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc); + Index actual_k2 = IsLower ? k2 : k2-actual_kc; + + // align blocks with the end of the triangular part for trapezoidal rhs + if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols)) + { + actual_kc = cols-k2; + k2 = actual_k2 + actual_kc - kc; + } + + // remaining size + Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2; + // size of the triangular part + Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; + + Scalar* geb = blockB+ts*ts; + geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar)); + + pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs); + + // pack the triangular part of the rhs padding the unrolled blocks with zeros + if(ts>0) + { + for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index actual_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; + // general part + pack_rhs_panel(blockB+j2*actual_kc, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), + panelLength, actualPanelWidth, + actual_kc, panelOffset); + + // append the triangular part via a temporary buffer + for (Index j=0;j<actualPanelWidth;++j) + { + if (SetDiag) + triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j); + for (Index k=IsLower ? j+1 : 0; IsLower ? 
k<actualPanelWidth : k<j; ++k) + triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j); + } + + pack_rhs_panel(blockB+j2*actual_kc, + RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), + actualPanelWidth, actualPanelWidth, + actual_kc, j2); + } + } + + for (Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(mc,rows-i2); + pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); + + // triangular kernel + if(ts>0) + { + for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; + Index blockOffset = IsLower ? j2 : 0; + + gebp_kernel(res.getSubMapper(i2, actual_k2 + j2), + blockA, blockB+j2*actual_kc, + actual_mc, panelLength, actualPanelWidth, + alpha, + actual_kc, actual_kc, // strides + blockOffset, blockOffset);// offsets + } + } + gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2), + blockA, geb, actual_mc, actual_kc, rs, + alpha, + -1, -1, 0, 0); + } + } + } + +/*************************************************************************** +* Wrapper to product_triangular_matrix_matrix +***************************************************************************/ + +} // end namespace internal + +namespace internal { +template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> +struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> +{ + template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) + { + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar Scalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; + + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); + + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; + + enum { IsLower = (Mode&Lower) == Lower }; + Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols()); + Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows()); + Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) + : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); + + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); + + internal::product_triangular_matrix_matrix<Scalar, Index, + Mode, LhsIsTriangular, + (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? 
RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime> + ::run( + stripedRows, stripedCols, stripedDepth, // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info + actualAlpha, blocking + ); + + // Apply correction if the diagonal is unit and a scalar factor was nested: + if ((Mode&UnitDiag)==UnitDiag) + { + if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); + } + else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) + { + Index diagSize = (std::min)(rhs.rows(),rhs.cols()); + dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); + } + } + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h new file mode 100644 index 000000000..a98d12e4a --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -0,0 +1,317 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Triangular matrix * matrix product functionality based on ?TRMM. 
+ ******************************************************************************** +*/ + +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + + +template <typename Scalar, typename Index, + int Mode, bool LhsIsTriangular, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResStorageOrder> +struct product_triangular_matrix_matrix_trmm : + product_triangular_matrix_matrix<Scalar,Index,Mode, + LhsIsTriangular,LhsStorageOrder,ConjugateLhs, + RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {}; + + +// try to go to BLAS specialization +#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ +template <typename Index, int Mode, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \ + static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\ + const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ + product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \ + LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \ + RhsStorageOrder, ConjugateRhs, ColMajor>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + } \ +}; + +EIGEN_BLAS_TRMM_SPECIALIZE(double, true) +EIGEN_BLAS_TRMM_SPECIALIZE(double, false) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(float, true) +EIGEN_BLAS_TRMM_SPECIALIZE(float, false) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) + +// implements col-major += alpha * op(triangular) * op(general) +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ +template <typename Index, int Mode, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ + LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \ + }; \ +\ + static void run( \ + Index _rows, Index _cols, Index _depth, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \ + { \ + Index diagSize = (std::min)(_rows,_depth); \ + Index rows = IsLower ? _rows : diagSize; \ + Index depth = IsLower ? diagSize : _depth; \ + Index cols = _cols; \ +\ + typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ + typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ +\ +/* Non-square case - doesn't fit to BLAS ?TRMM. 
Fall to default triangular product or call BLAS ?GEMM*/ \ + if (rows != depth) { \ +\ + /* FIXME handle mkl_domain_get_max_threads */ \ + /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\ +\ + if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \ + /* Most likely no benefit to call TRMM or GEMM from BLAS */ \ + product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ + /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \ + } else { \ + /* Make sense to call GEMM */ \ + Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ + MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ + gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ + general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \ + rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \ +\ + /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ + } \ + return; \ + } \ + char side = 'L', transa, uplo, diag = 'N'; \ + EIGTYPE *b; \ + const EIGTYPE *a; \ + BlasIndex m, n, lda, ldb; \ +\ +/* Set m, n */ \ + m = convert_index<BlasIndex>(diagSize); \ + n = convert_index<BlasIndex>(cols); \ +\ +/* Set trans */ \ + transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ +\ +/* Set b, ldb */ \ + Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \ + MatrixX##EIGPREFIX b_tmp; \ +\ + if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \ + b = b_tmp.data(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ +\ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ +/* Set a, lda */ \ + Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \ + MatrixLhs a_tmp; \ +\ + if ((conjA!=0) || (SetDiag==0)) { \ + if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \ + if (IsZeroDiag) \ + a_tmp.diagonal().setZero(); \ + else if (IsUnitDiag) \ + a_tmp.diagonal().setOnes();\ + a = a_tmp.data(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ + } else { \ + a = _lhs; \ + lda = convert_index<BlasIndex>(lhsStride); \ + } \ + /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! 
\n";*/ \ +/* call ?trmm*/ \ + BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ +\ +/* Add op(a_triangular)*b into res*/ \ + Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ + res_tmp=res_tmp+b_tmp; \ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMM_L(double, double, d, dtrmm) +EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm) +EIGEN_BLAS_TRMM_L(float, float, f, strmm) +EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm) +#else +EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_) +EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_) +EIGEN_BLAS_TRMM_L(float, float, f, strmm_) +EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_) +#endif + +// implements col-major += alpha * op(general) * op(triangular) +#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ +template <typename Index, int Mode, \ + int LhsStorageOrder, bool ConjugateLhs, \ + int RhsStorageOrder, bool ConjugateRhs> \ +struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ + LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper, \ + conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \ + }; \ +\ + static void run( \ + Index _rows, Index _cols, Index _depth, \ + const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsStride, \ + EIGTYPE* res, Index resStride, \ + EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \ + { \ + Index diagSize = (std::min)(_cols,_depth); \ + Index rows = _rows; \ + Index depth = IsLower ? _depth : diagSize; \ + Index cols = IsLower ? diagSize : _cols; \ +\ + typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ + typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ +\ +/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \ + if (cols != depth) { \ +\ + int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \ +\ + if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ + /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \ + product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ + /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \ + } else { \ + /* Make sense to call GEMM */ \ + Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ + MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ + gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ + general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \ + rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \ +\ + /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! 
" << nthr<<" \n";*/ \ + } \ + return; \ + } \ + char side = 'R', transa, uplo, diag = 'N'; \ + EIGTYPE *b; \ + const EIGTYPE *a; \ + BlasIndex m, n, lda, ldb; \ +\ +/* Set m, n */ \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(diagSize); \ +\ +/* Set trans */ \ + transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ +\ +/* Set b, ldb */ \ + Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \ + MatrixX##EIGPREFIX b_tmp; \ +\ + if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \ + b = b_tmp.data(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ +\ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \ +/* Set a, lda */ \ + Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \ + MatrixRhs a_tmp; \ +\ + if ((conjA!=0) || (SetDiag==0)) { \ + if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \ + if (IsZeroDiag) \ + a_tmp.diagonal().setZero(); \ + else if (IsUnitDiag) \ + a_tmp.diagonal().setOnes();\ + a = a_tmp.data(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ + } else { \ + a = _rhs; \ + lda = convert_index<BlasIndex>(rhsStride); \ + } \ + /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \ +/* call ?trmm*/ \ + BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ +\ +/* Add op(a_triangular)*b into res*/ \ + Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ + res_tmp=res_tmp+b_tmp; \ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMM_R(double, double, d, dtrmm) +EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm) +EIGEN_BLAS_TRMM_R(float, float, f, strmm) +EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm) +#else +EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_) +EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_) +EIGEN_BLAS_TRMM_R(float, float, f, strmm_) +EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_) +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h new file mode 100644 index 000000000..76bfa159c --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h @@ -0,0 +1,350 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H +#define EIGEN_TRIANGULARMATRIXVECTOR_H + +namespace Eigen { + +namespace internal { + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder, int Version=Specialized> +struct triangular_matrix_vector_product; + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version> +struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version> +{ + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + enum { + IsLower = ((Mode&Lower)==Lower), + HasUnitDiag = (Mode & UnitDiag)==UnitDiag, + HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag + }; + static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha); +}; + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version> +EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version> + ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) + { + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + Index size = (std::min)(_rows,_cols); + Index rows = IsLower ? _rows : (std::min)(_rows,_cols); + Index cols = IsLower ? (std::min)(_rows,_cols) : _cols; + + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); + + typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap; + const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); + typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); + + typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap; + ResMap res(_res,rows); + + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + + for (Index pi=0; pi<size; pi+=PanelWidth) + { + Index actualPanelWidth = (std::min)(PanelWidth, size-pi); + for (Index k=0; k<actualPanelWidth; ++k) + { + Index i = pi + k; + Index s = IsLower ? ((HasUnitDiag||HasZeroDiag) ? i+1 : i ) : pi; + Index r = IsLower ? actualPanelWidth-k : k+1; + if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0) + res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r); + if (HasUnitDiag) + res.coeffRef(i) += alpha * cjRhs.coeff(i); + } + Index r = IsLower ? rows - pi - actualPanelWidth : pi; + if (r>0) + { + Index s = IsLower ? 
pi+actualPanelWidth : 0; + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( + r, actualPanelWidth, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), + &res.coeffRef(s), resIncr, alpha); + } + } + if((!IsLower) && cols>size) + { + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( + rows, cols-size, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), + _res, resIncr, alpha); + } + } + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version> +struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version> +{ + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + enum { + IsLower = ((Mode&Lower)==Lower), + HasUnitDiag = (Mode & UnitDiag)==UnitDiag, + HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag + }; + static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha); +}; + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version> +EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version> + ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + { + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + Index diagSize = (std::min)(_rows,_cols); + Index rows = IsLower ? _rows : diagSize; + Index cols = IsLower ? diagSize : _cols; + + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); + + typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap; + const RhsMap rhs(_rhs,cols); + typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); + + typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap; + ResMap res(_res,rows,InnerStride<>(resIncr)); + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + + for (Index pi=0; pi<diagSize; pi+=PanelWidth) + { + Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi); + for (Index k=0; k<actualPanelWidth; ++k) + { + Index i = pi + k; + Index s = IsLower ? pi : ((HasUnitDiag||HasZeroDiag) ? i+1 : i); + Index r = IsLower ? k+1 : actualPanelWidth-k; + if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0) + res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum(); + if (HasUnitDiag) + res.coeffRef(i) += alpha * cjRhs.coeff(i); + } + Index r = IsLower ? pi : cols - pi - actualPanelWidth; + if (r>0) + { + Index s = IsLower ? 
0 : pi + actualPanelWidth; + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( + actualPanelWidth, r, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), + &res.coeffRef(pi), resIncr, alpha); + } + } + if(IsLower && rows>diagSize) + { + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( + rows-diagSize, cols, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), + &res.coeffRef(diagSize), resIncr, alpha); + } + } + +/*************************************************************************** +* Wrapper to product_triangular_vector +***************************************************************************/ + +template<int Mode,int StorageOrder> +struct trmv_selector; + +} // end namespace internal + +namespace internal { + +template<int Mode, typename Lhs, typename Rhs> +struct triangular_product_impl<Mode,true,Lhs,false,Rhs,true> +{ + template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha) + { + eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols()); + + internal::trmv_selector<Mode,(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(lhs, rhs, dst, alpha); + } +}; + +template<int Mode, typename Lhs, typename Rhs> +struct triangular_product_impl<Mode,false,Lhs,true,Rhs,false> +{ + template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha) + { + eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols()); + + Transpose<Dest> dstT(dst); + internal::trmv_selector<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower), + (int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor> + ::run(rhs.transpose(),lhs.transpose(), dstT, alpha); + } +}; + +} // end namespace internal + +namespace internal { + +// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same. + +template<int Mode> struct trmv_selector<Mode,ColMajor> +{ + template<typename Lhs, typename Rhs, typename Dest> + static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha) + { + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar ResScalar; + typedef typename Dest::RealScalar RealScalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + + typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest; + + typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs); + + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; + + enum { + // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1 + // on, the other hand it is good for the cache to pack the vector anyways... 
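      // Editorial note, not part of the original file: these flags mirror the
      // gemv_selector logic. The kernel may write directly into 'dest' only when
      // the destination has unit inner stride and the scalar factor can be passed
      // as a RhsScalar (i.e. it has no imaginary part when the rhs is real);
      // otherwise the product is accumulated into an aligned stack buffer
      // (MappedDest) and copied or added back into 'dest' afterwards.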
+ EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1, + ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex), + MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal + }; + + gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest; + + bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); + bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; + + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); + + ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), + evalToDest ? dest.data() : static_dest.data()); + + if(!evalToDest) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + Index size = dest.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + if(!alphaIsCompatible) + { + MappedDest(actualDestPtr, dest.size()).setZero(); + compatibleAlpha = RhsScalar(1); + } + else + MappedDest(actualDestPtr, dest.size()) = dest; + } + + internal::triangular_matrix_vector_product + <Index,Mode, + LhsScalar, LhsBlasTraits::NeedToConjugate, + RhsScalar, RhsBlasTraits::NeedToConjugate, + ColMajor> + ::run(actualLhs.rows(),actualLhs.cols(), + actualLhs.data(),actualLhs.outerStride(), + actualRhs.data(),actualRhs.innerStride(), + actualDestPtr,1,compatibleAlpha); + + if (!evalToDest) + { + if(!alphaIsCompatible) + dest += actualAlpha * MappedDest(actualDestPtr, dest.size()); + else + dest = MappedDest(actualDestPtr, dest.size()); + } + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } + } +}; + +template<int Mode> struct trmv_selector<Mode,RowMajor> +{ + template<typename Lhs, typename Rhs, typename Dest> + static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha) + { + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar ResScalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; + + typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs); + typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs); + + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; + + enum { + DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 + }; + + gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs; + + ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(), + DirectlyUseRhs ? 
const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data()); + + if(!DirectlyUseRhs) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + Index size = actualRhs.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; + } + + internal::triangular_matrix_vector_product + <Index,Mode, + LhsScalar, LhsBlasTraits::NeedToConjugate, + RhsScalar, RhsBlasTraits::NeedToConjugate, + RowMajor> + ::run(actualLhs.rows(),actualLhs.cols(), + actualLhs.data(),actualLhs.outerStride(), + actualRhsPtr,1, + dest.data(),dest.innerStride(), + actualAlpha); + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULARMATRIXVECTOR_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h new file mode 100644 index 000000000..3d47a2b94 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -0,0 +1,255 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Triangular matrix-vector product functionality based on ?TRMV. 
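When Eigen is compiled with EIGEN_USE_BLAS or EIGEN_USE_MKL, the specializations in this file intercept the same triangular matrix-vector products that the built-in kernels above otherwise handle; a usage sketch (editorial illustration, assuming a BLAS library is linked and EIGEN_USE_BLAS is defined) is:

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      MatrixXd A = MatrixXd::Random(100, 100);
      VectorXd x = VectorXd::Random(100);
      VectorXd y = VectorXd::Zero(100);
      // For float/double/scomplex/dcomplex the square part of this product is
      // forwarded to the BLAS ?trmv routine, with ?gemv handling any trapezoidal
      // remainder when the matrix is not square.
      y.noalias() += A.triangularView<Upper>() * x;
      return 0;
    }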
+ ******************************************************************************** +*/ + +#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H + +namespace Eigen { + +namespace internal { + +/********************************************************************** +* This file implements triangular matrix-vector multiplication using BLAS +**********************************************************************/ + +// trmv/hemv specialization + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder> +struct triangular_matrix_vector_product_trmv : + triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {}; + +#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ +template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ +struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + } \ +}; \ +template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ +struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + } \ +}; + +EIGEN_BLAS_TRMV_SPECIALIZE(double) +EIGEN_BLAS_TRMV_SPECIALIZE(float) +EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) + +// implements col-major: res += alpha * op(triangular) * vector +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ +template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ +struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper \ + }; \ + static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ + { \ + if (ConjLhs || IsZeroDiag) { \ + triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + return; \ + }\ + Index size = (std::min)(_rows,_cols); \ + Index rows = IsLower ? _rows : size; \ + Index cols = IsLower ? 
size : _cols; \ +\ + typedef VectorX##EIGPREFIX VectorRhs; \ + EIGTYPE *x, *y;\ +\ +/* Set x*/ \ + Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + VectorRhs x_tmp; \ + if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ + x = x_tmp.data(); \ +\ +/* Square part handling */\ +\ + char trans, uplo, diag; \ + BlasIndex m, n, lda, incx, incy; \ + EIGTYPE const *a; \ + EIGTYPE beta(1); \ +\ +/* Set m, n */ \ + n = convert_index<BlasIndex>(size); \ + lda = convert_index<BlasIndex>(lhsStride); \ + incx = 1; \ + incy = convert_index<BlasIndex>(resIncr); \ +\ +/* Set uplo, trans and diag*/ \ + trans = 'N'; \ + uplo = IsLower ? 'L' : 'U'; \ + diag = IsUnitDiag ? 'U' : 'N'; \ +\ +/* call ?TRMV*/ \ + BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ +\ +/* Add op(a_tr)rhs into res*/ \ + BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ +/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \ + if (size<(std::max)(rows,cols)) { \ + if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ + x = x_tmp.data(); \ + if (size<rows) { \ + y = _res + size*resIncr; \ + a = _lhs + size; \ + m = convert_index<BlasIndex>(rows-size); \ + n = convert_index<BlasIndex>(size); \ + } \ + else { \ + x += size; \ + y = _res; \ + a = _lhs + size*lda; \ + m = convert_index<BlasIndex>(size); \ + n = convert_index<BlasIndex>(cols-size); \ + } \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + } \ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif + +// implements row-major: res += alpha * op(triangular) * vector +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ +template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ +struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + LowUp = IsLower ? Lower : Upper \ + }; \ + static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ + { \ + if (IsZeroDiag) { \ + triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + return; \ + }\ + Index size = (std::min)(_rows,_cols); \ + Index rows = IsLower ? _rows : size; \ + Index cols = IsLower ? 
size : _cols; \
+\
+ typedef VectorX##EIGPREFIX VectorRhs; \
+ EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+ VectorRhs x_tmp; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+ char trans, uplo, diag; \
+ BlasIndex m, n, lda, incx, incy; \
+ EIGTYPE const *a; \
+ EIGTYPE beta(1); \
+\
+/* Set m, n */ \
+ n = convert_index<BlasIndex>(size); \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ incx = 1; \
+ incy = convert_index<BlasIndex>(resIncr); \
+\
+/* Set uplo, trans and diag*/ \
+ trans = ConjLhs ? 'C' : 'T'; \
+ uplo = IsLower ? 'U' : 'L'; \
+ diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
+ if (size<(std::max)(rows,cols)) { \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+ if (size<rows) { \
+ y = _res + size*resIncr; \
+ a = _lhs + size*lda; \
+ m = convert_index<BlasIndex>(rows-size); \
+ n = convert_index<BlasIndex>(size); \
+ } \
+ else { \
+ x += size; \
+ y = _res; \
+ a = _lhs + size; \
+ m = convert_index<BlasIndex>(size); \
+ n = convert_index<BlasIndex>(cols-size); \
+ } \
+ BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+ } \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double, double, d, d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
new file mode 100644
index 000000000..6d879ba00
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -0,0 +1,337 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
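The file introduced below implements a blocked triangular solve with multiple right-hand sides: the triangular factor is traversed in cache-sized panels of width kc, each SmallPanelWidth-wide diagonal block is solved by a plain substitution loop, and the remaining rectangular updates (R2 -= A21 * B) are carried out by the packed GEBP kernel. For orientation only, here is a minimal, self-contained sketch of the unblocked column-major forward substitution that the blocked code generalizes; the function and parameter names (naive_lower_solve, ldl, ldb) are illustrative and not part of the patch.

#include <cstddef>

// Solve L*X = B in place for a column-major lower-triangular L with a
// non-unit diagonal; B (n x k, column-major) is overwritten with X.
template <typename Scalar>
void naive_lower_solve(const Scalar* L, std::ptrdiff_t ldl,
                       Scalar* B, std::ptrdiff_t ldb,
                       std::ptrdiff_t n, std::ptrdiff_t k)
{
  for (std::ptrdiff_t j = 0; j < k; ++j) {
    Scalar* b = B + j * ldb;                      // current right-hand side / solution column
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      b[i] /= L[i + i * ldl];                     // divide by the diagonal entry
      for (std::ptrdiff_t r = i + 1; r < n; ++r)  // eliminate the solved entry from the rows below
        b[r] -= L[r + i * ldl] * b[i];
    }
  }
}

The kernel below performs the same elimination on small diagonal blocks and expresses the trailing updates as packed gebp_kernel calls, so that the working set stays within the blocking sizes chosen by level3_blocking.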
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// if the rhs is row major, let's transpose the product
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
+{
+  static void run(
+    Index size, Index cols,
+    const Scalar* tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    triangular_solve_matrix<
+      Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
+      (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
+      NumTraits<Scalar>::IsComplex && Conjugate,
+      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
+      ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
+  }
+};
+
+/* Optimized triangular solver with multiple right-hand sides and the triangular matrix on the left
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
+{
+  static EIGEN_DONT_INLINE void run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    Index cols = otherSize;
+
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
+    TriMapper tri(_tri, triStride);
+    OtherMapper other(_other, otherStride, otherIncr);
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+
+    enum {
+      SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+      IsLower = (Mode&Lower) == Lower
+    };
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(size,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    conj_if<Conjugate> conj;
+    gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
+
+    // the goal here is to subdivide the Rhs panels such that we keep some cache
+    // coherence when accessing the rhs elements
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
+    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
+    subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
+
+    for(Index k2=IsLower ? 0 : size;
+        IsLower ? 
k2<size : k2>0; + IsLower ? k2+=kc : k2-=kc) + { + const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc); + + // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel, + // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into + // A11 (the triangular part) and A21 the remaining rectangular part. + // Then the high level algorithm is: + // - B = R1 => general block copy (done during the next step) + // - R1 = A11^-1 B => tricky part + // - update B from the new R1 => actually this has to be performed continuously during the above step + // - R2 -= A21 * B => GEPP + + // The tricky part: compute R1 = A11^-1 B while updating B from R1 + // The idea is to split A11 into multiple small vertical panels. + // Each panel can be split into a small triangular part T1k which is processed without optimization, + // and the remaining small part T2k which is processed using gebp with appropriate block strides + for(Index j2=0; j2<cols; j2+=subcols) + { + Index actual_cols = (std::min)(cols-j2,subcols); + // for each small vertical panels [T1k^T, T2k^T]^T of lhs + for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth); + // tr solve + for (Index k=0; k<actualPanelWidth; ++k) + { + // TODO write a small kernel handling this (can be shared with trsv) + Index i = IsLower ? k2+k1+k : k2-k1-k-1; + Index rs = actualPanelWidth - k - 1; // remaining size + Index s = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1) + : IsLower ? i+1 : i-rs; + + Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i)); + for (Index j=j2; j<j2+actual_cols; ++j) + { + if (TriStorageOrder==RowMajor) + { + Scalar b(0); + const Scalar* l = &tri(i,s); + typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j); + for (Index i3=0; i3<k; ++i3) + b += conj(l[i3]) * r(i3); + + other(i,j) = (other(i,j) - b)*a; + } + else + { + Scalar& otherij = other(i,j); + otherij *= a; + Scalar b = otherij; + typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j); + typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i); + for (Index i3=0;i3<rs;++i3) + r(i3) -= b * conj(l(i3)); + } + } + } + + Index lengthTarget = actual_kc-k1-actualPanelWidth; + Index startBlock = IsLower ? k2+k1 : k2-k1-actualPanelWidth; + Index blockBOffset = IsLower ? k1 : lengthTarget; + + // update the respective rows of B from other + pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset); + + // GEBP + if (lengthTarget>0) + { + Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; + + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); + + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + actualPanelWidth, actual_kc, 0, blockBOffset); + } + } + } + + // R2 -= A21 * B => GEPP + { + Index start = IsLower ? k2+kc : 0; + Index end = IsLower ? size : k2-kc; + for(Index i2=start; i2<end; i2+=mc) + { + const Index actual_mc = (std::min)(mc,end-i2); + if (actual_mc>0) + { + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? 
k2 : k2-kc), actual_kc, actual_mc); + + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + } + } + } + } + } + +/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right + */ +template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride> +struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride> +{ + static EIGEN_DONT_INLINE void run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherIncr, Index otherStride, + level3_blocking<Scalar,Scalar>& blocking); +}; +template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride> +EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherIncr, Index otherStride, + level3_blocking<Scalar,Scalar>& blocking) + { + Index rows = otherSize; + typedef typename NumTraits<Scalar>::Real RealScalar; + + typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper; + LhsMapper lhs(_other, otherStride, otherIncr); + RhsMapper rhs(_tri, triStride); + + typedef gebp_traits<Scalar,Scalar> Traits; + enum { + RhsStorageOrder = TriStorageOrder, + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower + }; + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*size; + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + + conj_if<Conjugate> conj; + gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel; + + for(Index k2=IsLower ? size : 0; + IsLower ? k2>0 : k2<size; + IsLower ? k2-=kc : k2+=kc) + { + const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc); + Index actual_k2 = IsLower ? k2-actual_kc : k2 ; + + Index startPanel = IsLower ? 0 : k2+actual_kc; + Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc; + Scalar* geb = blockB+actual_kc*actual_kc; + + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); + + // triangular packing (we only pack the panels off the diagonal, + // neglecting the blocks overlapping the diagonal + { + for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index actual_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? 
actual_kc-j2-actualPanelWidth : j2; + + if (panelLength>0) + pack_rhs_panel(blockB+j2*actual_kc, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), + panelLength, actualPanelWidth, + actual_kc, panelOffset); + } + } + + for(Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(mc,rows-i2); + + // triangular solver kernel + { + // for each small block of the diagonal (=> vertical panels of rhs) + for (Index j2 = IsLower + ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth) + : Index(SmallPanelWidth))) + : 0; + IsLower ? j2>=0 : j2<actual_kc; + IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index absolute_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2; + + // GEBP + if(panelLength>0) + { + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), + blockA, blockB+j2*actual_kc, + actual_mc, panelLength, actualPanelWidth, + Scalar(-1), + actual_kc, actual_kc, // strides + panelOffset, panelOffset); // offsets + } + + // unblocked triangular solve + for (Index k=0; k<actualPanelWidth; ++k) + { + Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k; + + typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j); + for (Index k3=0; k3<k; ++k3) + { + Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j)); + typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3); + for (Index i=0; i<actual_mc; ++i) + r(i) -= a(i) * b; + } + if((Mode & UnitDiag)==0) + { + Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j)); + for (Index i=0; i<actual_mc; ++i) + r(i) *= inv_rjj; + } + } + + // pack the just computed part of lhs to A + pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2), + actualPanelWidth, actual_mc, + actual_kc, j2); + } + } + + if (rs>0) + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, + actual_mc, actual_kc, rs, Scalar(-1), + -1, -1, 0, 0); + } + } + } + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h new file mode 100644 index 000000000..621194ce6 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h @@ -0,0 +1,167 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ******************************************************************************** + * Content : Eigen bindings to BLAS F77 + * Triangular matrix * matrix product functionality based on ?TRMM. + ******************************************************************************** +*/ + +#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H + +namespace Eigen { + +namespace internal { + +// implements LeftSide op(triangular)^-1 * general +#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \ +template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \ +struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \ + }; \ + static void run( \ + Index size, Index otherSize, \ + const EIGTYPE* _tri, Index triStride, \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \ + { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ + BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \ + char side = 'L', uplo, diag='N', transa; \ + /* Set alpha_ */ \ + EIGTYPE alpha(1); \ + ldb = convert_index<BlasIndex>(otherStride);\ +\ + const EIGTYPE *a; \ +/* Set trans */ \ + transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 
'U' : 'L'; \ +/* Set a, lda */ \ + typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \ + Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \ + MatrixTri a_tmp; \ +\ + if (conjA) { \ + a_tmp = tri.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ + } else { \ + a = _tri; \ + lda = convert_index<BlasIndex>(triStride); \ + } \ + if (IsUnitDiag) diag='U'; \ +/* call ?trsm*/ \ + BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRSM_L(double, double, dtrsm) +EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm) +EIGEN_BLAS_TRSM_L(float, float, strsm) +EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm) +#else +EIGEN_BLAS_TRSM_L(double, double, dtrsm_) +EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_) +EIGEN_BLAS_TRSM_L(float, float, strsm_) +EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_) +#endif + +// implements RightSide general * op(triangular)^-1 +#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \ +template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \ +struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \ +{ \ + enum { \ + IsLower = (Mode&Lower) == Lower, \ + IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \ + IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \ + conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \ + }; \ + static void run( \ + Index size, Index otherSize, \ + const EIGTYPE* _tri, Index triStride, \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \ + { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ + BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \ + char side = 'R', uplo, diag='N', transa; \ + /* Set alpha_ */ \ + EIGTYPE alpha(1); \ + ldb = convert_index<BlasIndex>(otherStride);\ +\ + const EIGTYPE *a; \ +/* Set trans */ \ + transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \ +/* Set uplo */ \ + uplo = IsLower ? 'L' : 'U'; \ + if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 
'U' : 'L'; \ +/* Set a, lda */ \ + typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \ + Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \ + MatrixTri a_tmp; \ +\ + if (conjA) { \ + a_tmp = tri.conjugate(); \ + a = a_tmp.data(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ + } else { \ + a = _tri; \ + lda = convert_index<BlasIndex>(triStride); \ + } \ + if (IsUnitDiag) diag='U'; \ +/* call ?trsm*/ \ + BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ + /*std::cout << "TRMS_L specialization!\n";*/ \ + } \ +}; + +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRSM_R(double, double, dtrsm) +EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm) +EIGEN_BLAS_TRSM_R(float, float, strsm) +EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm) +#else +EIGEN_BLAS_TRSM_R(double, double, dtrsm_) +EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_) +EIGEN_BLAS_TRSM_R(float, float, strsm_) +EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_) +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h new file mode 100644 index 000000000..647317016 --- /dev/null +++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h @@ -0,0 +1,148 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H +#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H + +namespace Eigen { + +namespace internal { + +template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder> +struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder> +{ + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft, + ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag), + Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor + >::run(size, _lhs, lhsStride, rhs); + } +}; + +// forward and backward substitution, row-major, rhs is a vector +template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate> +struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor> +{ + enum { + IsLower = ((Mode&Lower)==Lower) + }; + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; + + typename internal::conditional< + Conjugate, + const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, + const LhsMap&> + ::type cjLhs(lhs); + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + for(Index pi=IsLower ? 0 : size; + IsLower ? pi<size : pi>0; + IsLower ? 
pi+=PanelWidth : pi-=PanelWidth) + { + Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth); + + Index r = IsLower ? pi : size - pi; // remaining size + if (r > 0) + { + // let's directly call the low level product function because: + // 1 - it is faster to compile + // 2 - it is slightly faster at runtime + Index startRow = IsLower ? pi : pi-actualPanelWidth; + Index startCol = IsLower ? 0 : pi; + + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run( + actualPanelWidth, r, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), + rhs + startRow, 1, + RhsScalar(-1)); + } + + for(Index k=0; k<actualPanelWidth; ++k) + { + Index i = IsLower ? pi+k : pi-k-1; + Index s = IsLower ? pi : i+1; + if (k>0) + rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum(); + + if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) + rhs[i] /= cjLhs(i,i); + } + } + } +}; + +// forward and backward substitution, column-major, rhs is a vector +template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate> +struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor> +{ + enum { + IsLower = ((Mode&Lower)==Lower) + }; + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; + typename internal::conditional<Conjugate, + const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, + const LhsMap& + >::type cjLhs(lhs); + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + + for(Index pi=IsLower ? 0 : size; + IsLower ? pi<size : pi>0; + IsLower ? pi+=PanelWidth : pi-=PanelWidth) + { + Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth); + Index startBlock = IsLower ? pi : pi-actualPanelWidth; + Index endBlock = IsLower ? pi + actualPanelWidth : 0; + + for(Index k=0; k<actualPanelWidth; ++k) + { + Index i = IsLower ? pi+k : pi-k-1; + if(numext::not_equal_strict(rhs[i],RhsScalar(0))) + { + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs.coeff(i,i); + + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } + } + Index r = IsLower ? size - endBlock : startBlock; // remaining size + if (r > 0) + { + // let's directly call the low level product function because: + // 1 - it is faster to compile + // 2 - it is slightly faster at runtime + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run( + r, actualPanelWidth, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), + rhs+endBlock, 1, RhsScalar(-1)); + } + } + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H |
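As a closing reference, here is a small user-level sketch of how the kernels added in these files are reached. It is not part of the patch; it assumes a translation unit compiled against these Eigen headers, optionally with EIGEN_USE_BLAS or EIGEN_USE_MKL defined so that the ?TRMV/?TRSM bindings above take over from the built-in kernels. The public TriangularView expressions are the entry points that, for dynamic-size operands, dispatch to triangular_matrix_vector_product, triangular_solve_vector and triangular_solve_matrix.

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Dynamic-size matrices; only the lower-triangular part of A is referenced below.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  A.diagonal().array() += 4.0;                       // keep the triangular factor well conditioned
  Eigen::VectorXd b = Eigen::VectorXd::Random(4);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 2); // two right-hand sides

  // Triangular matrix * vector: handled by triangular_matrix_vector_product
  // (or the ?TRMV binding when an external BLAS/MKL is enabled).
  Eigen::VectorXd y = A.triangularView<Eigen::Lower>() * b;

  // Triangular solve with multiple right-hand sides: handled by
  // triangular_solve_matrix (or the ?TRSM binding with an external BLAS).
  Eigen::MatrixXd X = A.triangularView<Eigen::Lower>().solve(B);

  // Single right-hand side: handled by triangular_solve_vector.
  Eigen::VectorXd x = A.triangularView<Eigen::Lower>().solve(b);

  // Residuals should be at round-off level.
  Eigen::MatrixXd RX = A.triangularView<Eigen::Lower>() * X;
  Eigen::VectorXd rx = A.triangularView<Eigen::Lower>() * x;
  std::cout << y.transpose() << "\n"
            << (RX - B).norm() << " " << (rx - b).norm() << "\n";
  return 0;
}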